# Exploratory Analisys
In this notebook will be performed the exploratory analysis of the data.

## Libraries

In [229]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
import country_converter as coco
import re

## Load Data

In [323]:
claims_df = pd.read_csv('./data/postprocessed/claims.csv')

In [324]:
answers_df = pd.read_csv('./data/postprocessed/answers.csv')

In [325]:
# lowercase the speaker column in the dataframe
for speaker in claims_df['speaker']:   
    claims_df.loc[claims_df['speaker'] == speaker, 'speaker'] = str(speaker).lower()

In [345]:
# remove null values from the speaker column
claims_df = claims_df.dropna(subset=['speaker'])
# delete special characters from the speaker column
claims_df['speaker'] = claims_df['speaker'].str.replace(r'[^\w\s]', '')
# remove spaces from the speaker column
claims_df['speaker'] = claims_df['speaker'].str.replace(' ', '')

In [346]:
def remove_titles(text):
    # Define a list of common English titles and political titles (all lowercase)
    common_titles = ['mr', 'mrs', 'ms', 'dr', 'prof', 'rev', 'capt', 'col', 'maj', 'sir', 'madam', 'miss']
    political_titles = ['president', 'vice president', 'prime minister', 'senator', 'congressman', 'congresswoman', 'governor', 'mayor']

    # Define a list of two-letter country codes (all lowercase)
    country_codes = ['us', 'uk', 'au', 'ca', 'fr', 'de', 'jp', 'cn', 'in']  # Add more as needed

    # Combine all titles and country codes into one list
    all_titles = common_titles + political_titles + country_codes

    # Construct a regular expression pattern to match titles and country codes
    pattern = r'\b(?:' + '|'.join(all_titles) + r')\b'

    # Use re.sub() to replace the matched titles with an empty string
    text_without_titles = re.sub(pattern, '', text, flags=re.IGNORECASE)

    return text_without_titles

# Apply the function to the speaker column
claims_df['speaker'] = claims_df['speaker'].apply(remove_titles)

## Claims Analisys

In [347]:
# Collect the number of unique labels, speakers, locations
u_labels = claims_df['label'].unique()
n_labels = claims_df['label'].nunique()
u_speakers = claims_df['speaker'].unique()
n_speakers = claims_df['speaker'].nunique()
u_locations = claims_df['location_ISO_code'].unique()
n_locations = claims_df['location_ISO_code'].nunique()

print(f'Number of unique labels: {n_labels}')
print(f'Labels:\n {u_labels}')
print(f'Number of unique speakers: {n_speakers}')
print(f'Speakers:\n {u_speakers}')
print(f'Number of unique locations: {n_locations}')
print(f'Locations:\n {u_locations}')

Number of unique labels: 4
Labels:
 ['Supported' 'Refuted' 'Conflicting Evidence/Cherrypicking'
 'Not Enough Evidence']
Number of unique speakers: 1261
Speakers:
 ['pambondi' 'erictrump' 'railaodinga' ... 'stonyrushing' 'sunilbassi'
 'nikkisweightlosssupport']
Number of unique locations: 53
Locations:
 ['US' 'KE' 'IN' 'NG' 'NZ' nan 'GH' 'ZA' 'HK' 'GB' 'PH' 'KR' 'CA' 'SE' 'RU'
 'TR' 'PK' 'AU' 'TW' 'MM' 'DE' 'IE' 'IR' 'ES' 'CH' 'CN' 'TZ' 'MX' 'UG'
 'IL' 'KP' 'IT' 'SA' 'AE' 'MG' 'RS' 'LK' 'MY' 'JP' 'VA' 'TH' 'ET' 'ZW'
 'SY' 'BW' 'UA' 'ID' 'SZ' 'FR' 'PL' 'MZ' 'MW' 'EG' 'BB']


### Claims by State

In [348]:
# detect the countries with the most refuted claims
refuted_df = claims_df[claims_df['label'] == 'Refuted']
# group by location_ISO_code and count the number of refuted claims
refuted_by_state = refuted_df.groupby('location_ISO_code').size().reset_index(name='count')
refuted_by_state

Unnamed: 0,location_ISO_code,count
0,AE,1
1,AU,22
2,BB,1
3,BW,1
4,CA,7
5,CH,1
6,CN,19
7,DE,2
8,EG,1
9,ES,2


In [349]:
# get the overall claims by state
claims_by_state =  claims_df.groupby('location_ISO_code').size().reset_index(name='count')
claims_by_state

Unnamed: 0,location_ISO_code,count
0,AE,2
1,AU,30
2,BB,1
3,BW,1
4,CA,8
5,CH,2
6,CN,29
7,DE,3
8,EG,1
9,ES,2


In [350]:
# get the percentage of refuted claims by state matching by location_ISO_code
refuted_by_state = refuted_by_state.merge(claims_by_state, on='location_ISO_code')
refuted_by_state['percentage'] = refuted_by_state['count_x'] / refuted_by_state['count_y']
refuted_by_state

Unnamed: 0,location_ISO_code,count_x,count_y,percentage
0,AE,1,2,0.5
1,AU,22,30,0.733333
2,BB,1,1,1.0
3,BW,1,1,1.0
4,CA,7,8,0.875
5,CH,1,2,0.5
6,CN,19,29,0.655172
7,DE,2,3,0.666667
8,EG,1,1,1.0
9,ES,2,2,1.0


In [351]:
# rename the columns
refuted_by_state = refuted_by_state.rename(columns={'count_x': 'refuted', 'count_y': 'total'})

In [352]:
# sort the states by percentage of refuted claims
refuted_by_state = refuted_by_state.sort_values(by='percentage', ascending=False)
refuted_by_state

Unnamed: 0,location_ISO_code,refuted,total,percentage
38,TH,4,4,1.0
25,MM,1,1,1.0
41,TZ,1,1,1.0
42,UA,3,3,1.0
20,JP,4,4,1.0
39,TR,2,2,1.0
27,MX,3,3,1.0
43,UG,1,1,1.0
28,MY,1,1,1.0
15,ID,1,1,1.0


In [353]:
# filter the states with more than 50 claims
refuted_by_state = refuted_by_state[refuted_by_state['total'] > 5]
refuted_by_state

Unnamed: 0,location_ISO_code,refuted,total,percentage
24,LK,10,11,0.909091
4,CA,7,8,0.875
32,PH,29,34,0.852941
1,AU,22,30,0.733333
18,IN,178,263,0.676806
14,HK,4,6,0.666667
6,CN,19,29,0.655172
16,IE,7,11,0.636364
35,RU,17,29,0.586207
44,US,591,1025,0.576585


In [354]:
# translate iso code using country converter to ISO-3
iso2_states = np.array(refuted_by_state['location_ISO_code'])
iso3_states = coco.convert(names=iso2_states, to='ISO3')

In [355]:
# add the iso3 code to the dataframe using the correct order
for i in range(len(iso3_states)):
    refuted_by_state.loc[refuted_by_state['location_ISO_code'] == iso2_states[i], 'location_ISO_code'] = iso3_states[i]

In [356]:
# build a map with the percentage of refuted claims by state
fig = px.choropleth(refuted_by_state, locations='location_ISO_code', locationmode='ISO-3', color='percentage', hover_name='location_ISO_code', color_continuous_scale='Viridis', title='Percentage of refuted claims by state')
fig.show()

### Claims by Speaker

In [357]:
# detect the speakers with the most refuted claims
refuted_by_speaker = refuted_df.groupby('speaker').size().reset_index(name='count')
refuted_by_speaker

Unnamed: 0,speaker,count
0,"""crystal,",1
1,.deborahbirx,1
2,.jeonkwang-hun,1
3,.juliusokojie,1
4,.mattmccarthy,1
...,...,...
852,प्रशान्तिन्यूज़पेपर,1
853,අපිහෙළයෝ,1
854,රොබින්හුඩ්-robinhood,1
855,හොරොව්පතානපාරම්පරිකහෙළවෙදගෙදර,1


In [358]:
# get the overall claims by speaker
claims_by_speaker =  claims_df.groupby('speaker').size().reset_index(name='count')
claims_by_speaker

Unnamed: 0,speaker,count
0,,1
1,"""crystal,",1
2,.ayodolaadigun,1
3,.deborahbirx,1
4,.isaibrahim-ministerofcommunicationsanddigital...,1
...,...,...
1256,प्रशान्तिन्यूज़पेपर,1
1257,අපිහෙළයෝ,1
1258,රොබින්හුඩ්-robinhood,1
1259,හොරොව්පතානපාරම්පරිකහෙළවෙදගෙදර,1


In [359]:
# get the percentage of refuted claims by speaker matching by speaker
refuted_by_speaker = refuted_by_speaker.merge(claims_by_speaker, on='speaker')
refuted_by_speaker['percentage'] = refuted_by_speaker['count_x'] / refuted_by_speaker['count_y']
refuted_by_speaker

Unnamed: 0,speaker,count_x,count_y,percentage
0,"""crystal,",1,1,1.0
1,.deborahbirx,1,1,1.0
2,.jeonkwang-hun,1,1,1.0
3,.juliusokojie,1,2,0.5
4,.mattmccarthy,1,1,1.0
...,...,...,...,...
852,प्रशान्तिन्यूज़पेपर,1,1,1.0
853,අපිහෙළයෝ,1,1,1.0
854,රොබින්හුඩ්-robinhood,1,1,1.0
855,හොරොව්පතානපාරම්පරිකහෙළවෙදගෙදර,1,1,1.0


In [360]:
# rename the columns
refuted_by_speaker = refuted_by_speaker.rename(columns={'count_x': 'refuted', 'count_y': 'total'})

In [361]:
# sort the speakers by percentage of refuted claims
refuted_by_speaker = refuted_by_speaker.sort_values(by='percentage', ascending=False)
refuted_by_speaker

Unnamed: 0,speaker,refuted,total,percentage
0,"""crystal,",1,1,1.000000
542,munnasingh,1,1,1.000000
545,myanmarcelestar,1,1,1.000000
546,najibbalala.kenya'scabinetsecretaryfortourisma...,1,1,1.000000
547,namoindia,1,1,1.000000
...,...,...,...,...
210,cyrilramaphosa,10,74,0.135135
337,henryrotich,2,15,0.133333
204,corybooker,1,10,0.100000
538,muhammadubuhari,1,12,0.083333


In [362]:
# filter the speakers with more than 5 claims
refuted_by_speaker = refuted_by_speaker[refuted_by_speaker['total'] > 10]
refuted_by_speaker

Unnamed: 0,speaker,refuted,total,percentage
250,donaldtrump,163,250,0.652
702,socialmediausers,8,14,0.571429
441,kyledarbyshire,8,16,0.5
833,williamruto,13,28,0.464286
86,aljazeera,5,11,0.454545
703,socratesmbamalu,12,28,0.428571
841,wycliffeoparanya,16,46,0.347826
127,atikuabubakar,4,13,0.307692
141,berniesanders,8,33,0.242424
795,uhurukenyatta,13,55,0.236364


In [363]:
# build a bar plot with the percentage of refuted claims by speaker
fig = px.bar(refuted_by_speaker, x='speaker', y='percentage', title='Percentage of refuted claims by speaker', labels={'speaker': 'Speaker', 'percentage': 'Percentage of refuted claims'})
fig.show()