In [1]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity

# Read the clip transcripts and the lies table from their CSV files.
clips = pd.read_csv('data-files/transcript_data_combined.csv')
lies = pd.read_csv('data-files/all_speech_lies.csv')

# Missing text becomes an empty string in the one column of each table that
# is vectorized later in the script.
for frame, text_column in ((clips, 'Transcript'), (lies, 'Lie Quote')):
    frame[text_column] = frame[text_column].fillna('')

# Show which columns the lies table provides.
print(lies.columns)

# Build the bag-of-words vocabulary from the lie quotes only. Transcripts are
# projected into this same space below, so words that never occur in any lie
# quote do not contribute to the comparison.
vectorizer = CountVectorizer()
lie_vector = vectorizer.fit_transform(lies['Lie Quote'])

# For every transcript, find the lie quote with the highest cosine similarity.
# All transcripts are vectorized and compared in one batched call instead of
# one transform/cosine_similarity call per row; cosine similarity is computed
# row by row, so the per-transcript results are unchanged.
#
# NOTE: a transcript that shares no vocabulary with any lie scores 0 against
# all of them, so argmax falls back to index 0 and the first lie quote is
# reported alongside a similarity of 0.
if len(clips) > 0:
    transcript_vectors = vectorizer.transform(clips['Transcript'])

    # Dense matrix with one row per clip and one column per lie quote; its
    # size grows with len(clips) * len(lies).
    similarities = cosine_similarity(transcript_vectors, lie_vector)

    max_similarities = similarities.max(axis=1).tolist()
    most_similar_lies = (
        lies['Lie Quote'].iloc[similarities.argmax(axis=1)].tolist()
    )
else:
    # cosine_similarity rejects input with zero rows, so an empty clips table
    # simply gets empty result columns.
    max_similarities = []
    most_similar_lies = []

# Add the results to the original DataFrame
clips['Max Cosine Similarity'] = max_similarities
clips['Most Similar Lie'] = most_similar_lies

# Partition the clips around the 0.8 similarity cut-off: strictly above it
# goes to lies_detected, at or below it goes to truths_detected.
similarity_scores = clips['Max Cosine Similarity']
lies_detected = clips.loc[similarity_scores.gt(0.8)]
truths_detected = clips.loc[similarity_scores.le(0.8)]


ModuleNotFoundError: No module named 'pandas'