# Lie Detection

In [31]:
# imports
import pandas as pd

In [32]:
# necessary files
clip_transcripts = pd.read_csv('transcript_data_combined.csv')
lies = pd.read_csv('all_speech_lies.csv')
speech_info = pd.read_csv('speech_info.csv')

# create link between transcript clips and lies using speech_info
speech_info['Video ID'] = speech_info['YouTube Speech Link'].str.extract(r'v=([a-zA-Z0-9_-]+)')
vidID_to_speechID = dict(zip(speech_info['Video ID'], speech_info['Speech ID'],))
# get video id for transcripts
clip_transcripts = pd.read_csv('transcript_data_combined.csv')
clip_transcripts['Video ID'] = clip_transcripts['YouTube URL'].str.extract(r'youtu\.be/([a-zA-Z0-9_-]+)')
# replace video id with speech id and save to new column
clip_transcripts['Speech ID'] = clip_transcripts['Video ID'].replace(vidID_to_speechID).astype('int')

# display final files to use
display(clip_transcripts)
display(lies)
print('Note: the linking column between these tables is the "Speech ID". This can be used for a sanity check later.')

Unnamed: 0,YouTube URL,Chunk Number,Chunk Filename,Transcript,Video ID,Speech ID
0,https://youtu.be/-ofJu78Wpn0?si=H_BZ0KSYr_raPUac,1,/Users/milanvaghani/Desktop/Unstructed Machine...,,-ofJu78Wpn0,1
1,https://youtu.be/-ofJu78Wpn0?si=H_BZ0KSYr_raPUac,2,/Users/milanvaghani/Desktop/Unstructed Machine...,good evening everyone. Good evening good even...,-ofJu78Wpn0,1
2,https://youtu.be/-ofJu78Wpn0?si=H_BZ0KSYr_raPUac,3,/Users/milanvaghani/Desktop/Unstructed Machine...,thank you thank you thank you thank you thank...,-ofJu78Wpn0,1
3,https://youtu.be/-ofJu78Wpn0?si=H_BZ0KSYr_raPUac,4,/Users/milanvaghani/Desktop/Unstructed Machine...,thank you all thank you all okay we got to g...,-ofJu78Wpn0,1
4,https://youtu.be/-ofJu78Wpn0?si=H_BZ0KSYr_raPUac,5,/Users/milanvaghani/Desktop/Unstructed Machine...,thank you everyone thank you everyone thank yo...,-ofJu78Wpn0,1
...,...,...,...,...,...,...
533,https://youtu.be/Q5TIZnhXX7Q?si=agpeKU04EQ-AtdJ_,177,/Users/milanvaghani/Desktop/Unstructed Machine...,the Liberty Bell it's where the Army whether i...,Q5TIZnhXX7Q,3
534,https://youtu.be/Q5TIZnhXX7Q?si=agpeKU04EQ-AtdJ_,178,/Users/milanvaghani/Desktop/Unstructed Machine...,this is the place where Pennsylvania Patriots ...,Q5TIZnhXX7Q,3
535,https://youtu.be/Q5TIZnhXX7Q?si=agpeKU04EQ-AtdJ_,179,/Users/milanvaghani/Desktop/Unstructed Machine...,come our way no matter what obstacles we must ...,Q5TIZnhXX7Q,3
536,https://youtu.be/Q5TIZnhXX7Q?si=agpeKU04EQ-AtdJ_,180,/Users/milanvaghani/Desktop/Unstructed Machine...,we will make America healthy again we will ma...,Q5TIZnhXX7Q,3


Unnamed: 0,Speech ID,Lie Quote
0,1,"His explicit intent to jail journalists, polit..."
1,1,we know and we know what a second Trump term w...
2,1,We're not going back to when Donald Trump trie...
3,1,We are not going to let him end programs like ...
4,1,"as president, I will bring together labor and ..."
...,...,...
109,6,"You could do abortions in the seventh month, t..."
110,6,was one of the most incompetently handled situ...
111,6,There is not one member of the military who is...
112,6,I will not ban fracking. I have not banned fra...


Note: the linking column between these tables is the "Speech ID". This can be used for a sanity check later.


In [42]:
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity

clip_transcripts = pd.read_csv('transcript_data_combined.csv')
lies = pd.read_csv('all_speech_lies.csv')
speech_info = pd.read_csv('speech_info.csv')

speech_info['Video ID'] = speech_info['YouTube Speech Link'].str.extract(r'v=([a-zA-Z0-9_-]+)')
vidID_to_speechID = dict(zip(speech_info['Video ID'], speech_info['Speech ID']))
clip_transcripts['Video ID'] = clip_transcripts['YouTube URL'].str.extract(r'youtu\.be/([a-zA-Z0-9_-]+)')
clip_transcripts['Speech ID'] = clip_transcripts['Video ID'].replace(vidID_to_speechID).astype(int)

# Merging lies and transcripts data on 'Speech ID'
merged_data = pd.merge(lies, clip_transcripts, on='Speech ID', how='inner')

# Replacing the missing values with empty strings
merged_data['Lie Quote'] = merged_data['Lie Quote'].fillna('')
merged_data['Transcript'] = merged_data['Transcript'].fillna('')

# Vectorizing Lie Quotes and Transcripts individually using CountVectorizer
vectorizer = CountVectorizer()
lie_vector = vectorizer.fit_transform(merged_data['Lie Quote'])
transcript_vector = vectorizer.transform(merged_data['Transcript'])

# Calculating cosine similarities
cosine_similarities = cosine_similarity(lie_vector, transcript_vector)

# Finding the max cosine similarity for each transcript with any lie
merged_data['Cosine Similarity'] = cosine_similarities.max(axis=0)

# Defining the cutoff threshold (can be altered if needed)
cosine_similarity_cutoff = 0.8

# Classifying each transcript as a lie or truth based on the cutoff
merged_data['Classification'] = merged_data['Cosine Similarity'].apply(
    lambda x: 'Lie' if x >= cosine_similarity_cutoff else 'Truth'
)

# Grouping by Transcript and getting the max similarity and classification (lie or truth)
max_similarity_per_transcript = merged_data.groupby('Transcript').agg({
    'Cosine Similarity': 'max',
    'Classification': lambda x: 'Lie' if (x == 'Lie').any() else 'Truth'
}).reset_index()

# Saving results to a CSV file
max_similarity_per_transcript.to_csv('transcript_lie_truth_classification.csv', index=False)

# Total counts of lies and truths
classification_counts = merged_data['Classification'].value_counts()

print("Classification Counts:\n", classification_counts)

# Displaying the first five instances of lies and truths
lies = merged_data[merged_data['Classification'] == 'Lie']
truths = merged_data[merged_data['Classification'] == 'Truth']

print("\nLies:")
print(lies[['Lie Quote', 'Transcript', 'Cosine Similarity']].head())

print("\nTruths:")
print(truths[['Lie Quote', 'Transcript', 'Cosine Similarity']].head())

Classification Counts:
 Truth    5614
Lie       120
Name: Classification, dtype: int64

Lies:
                                            Lie Quote  \
37  His explicit intent to jail journalists, polit...   
45  His explicit intent to jail journalists, polit...   
47  His explicit intent to jail journalists, polit...   
52  His explicit intent to jail journalists, polit...   
56  His explicit intent to jail journalists, polit...   

                                           Transcript  Cosine Similarity  
37  who assaulted those law enforcement officers a...           0.816497  
45  Aurora area small town or big city and as pres...           0.937410  
47  his billionaire friends and he will give them ...           0.822581  
52  because of Donald Trump  and understand he is ...           0.924014  
56  and live free from the pollution that fuels th...           0.854482  

Truths:
                                           Lie Quote  \
0  His explicit intent to jail journalists, poli