## Imports

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import warnings

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import f1_score, confusion_matrix

import nltk
import spacy
from googletrans import Translator

In [3]:
from googletrans import Translator
translator = Translator()

def translate_column(df, column='source_title'):
    translated = []
    for i, text, lang in zip(df.index, df[column], df['lang']):
        if lang != 'en' and text == text:
            if lang != lang:
                lang = None
            try:
                translated += [translator.translate(text, src=lang, dest='en').text]
            except:
                translated += ['']
                print('ERROR AT {}'.format(i), text, lang)
        else:
            translated += [text]
        if i % 250 == 0:
            print('translated {} lines'.format(i+1))
    return translated

In [4]:
# translate train
#df_trans = pd.DataFrame()
df['source_title_en'] = translate_column(df)
df.to_csv('./train_data.csv')

translated 1 lines
ERROR AT 20 VoxCheck - ТЕРМІНОВО, НАМ ВСІМ БРЕШУТЬ, ВСЯ ПРАВДА ПРО... | Facebook nan
ERROR AT 30  Página no encontrada - El Surtidor nan
ERROR AT 33 Misbar - مسبار nan
ERROR AT 49 ပရုတ်ဆီကို ရေနွေးနဲ့ဖျော်ပြီး ထွက်လာတဲ့အငွေ့ကို ရှုခြင်းက COVID-19 ကို ကုသနိုင်တယ်ဆိုတဲ့ သတင်းမှား - Factcrescendo Myanmar nan
ERROR AT 76 VoxCheck - "Терміново! Сенсаційна новина потрясла... | Facebook nan
ERROR AT 109 Facebook nan
ERROR AT 122 
	[팩트체크] ① 마스크만 쓴 확진자…이송 지침 위반? | JTBC 뉴스
 nan
ERROR AT 137 Misbar - مسبار nan
ERROR AT 164 VoxCheck - Епідемії кожні 100 років! Співпадіння? — не... | Facebook nan
ERROR AT 197 ආලෝකමත් වූ නෙලුම් කුළුන අප්‍රේල් 11 කුරුණෑගලට දිස්වු ඡායාරූපක් නොවේ! - Factcrescendo Sri Lanka nan
ERROR AT 220 Misbar - مسبار nan
ERROR AT 234 
	[팩트체크] 일본은 40장씩 마스크 무료 배포? 직접 확인해보니 | JTBC 뉴스
 nan
translated 251 lines
ERROR AT 271 Misbar - مسبار nan
ERROR AT 278 အသက် ၁၀ စက္ကန့်အောင့်ထားနိုင်ရုံနဲ့ ကိုရိုနာဗိုင်းရပ်စ်ရှိမရှိမသိရှိနိုင်ပါ - Factcrescendo Myanmar nan
ERROR AT 3

In [6]:
# translate test
#df_trans = pd.DataFrame()
df_test['source_title_en'] = translate_column(df_test)
df_test.to_csv('./test_data.csv', index=False)

translated 1 lines
ERROR AT 48 
	[팩트체크] "중국 당국이 예방책으로 마늘 꼽았다?" 허위정보 검증 | JTBC 뉴스
 None
ERROR AT 52 Misbar - مسبار None
ERROR AT 58 ပိုဗီဒုန်း အိုင်အိုဒင်း(Povidone-Iodine)ဟာ COVID-19 ကို ကုသနိုင်၊ ကာကွယ်နိုင်တယ် ဆိုတဲ့ အထောက်အထားခိုင်ခိုင်မာမာမရှိသေးပါ - Factcrescendo Myanmar None
ERROR AT 66 නාවුක හමුදා සෙබළුන්ට වෛද්‍ය ඒලියන්ත වයිට්ගේ ප්‍රතිකාරයක් ලබාදෙන්නේ නැත! - Factcrescendo Sri Lanka None
ERROR AT 76 ඉන්දීය සුපිරි නළු අමීර් කාන් පාන්පිටි මලු වල සඟවා මුදල් බෙදා ඇති බවට සමාජ මාධ්‍ය තුල සංසරනය ව්‍යාජ පුවතක් - Factcrescendo Sri Lanka None
ERROR AT 98 ප්‍රභූ PCR පරීක්ෂණ1000 කට අධික ප්‍රමාණයක් දිනයකදී සිදුවූ බවට කළ සාවද්‍ය ප්‍රකාශයක්! - Factcrescendo Sri Lanka None
ERROR AT 118 အများပြည်သူ ထိတ်လန့်အောင် လှုံ့ဆော်တဲ့ သတင်းအတု None
ERROR AT 136 COVID-19 လူနာနဲ့ထိတွေ့ခဲ့တဲ့ သမ္မတနှင့် အတိုင်ပင်ခံပုဂ္ဂိုလ် ၁၄ ရက် self-quarantine လုပ်နေရဆိုတဲ့ သတင်းမှား - Factcrescendo Myanmar None
ERROR AT 137 COVID-19 ကြောင့် အီကွေဒေါနိုင်ငံ ပျက်သုဉ်းပြီဆိုတဲ့သတင်းမှား - Factcrescendo Myanmar None
ERROR AT

In [118]:
(df['title'].str.count('[!@#$%^&*()_+\-=\[\]{}\\|<>\/?]')/df['title'].str.len())

0       0.000000
1       0.000000
2       0.000000
3       0.000000
4       0.000000
          ...   
5526    0.000000
5527    0.000000
5528    0.000000
5529    0.030769
5530    0.011765
Name: title, Length: 5531, dtype: float64

## Load data

In [5]:
df = pd.read_csv('data/covid19_data.csv')
df_test = pd.read_csv('data/covid19_unlabelled_test.csv')
sample_submission = pd.read_csv('data/sample_submission.csv')

In [7]:
df.head(3)

Unnamed: 0,verifiedby,country,class,title,published_date,country1,country2,country3,country4,article_source,ref_source,source_title,content_text,category,lang
0,Delfi Melo Detektorius (Lie Detector),Lithuania,False,Claims that coronavirus is fake and Belarus i...,2020/05/11,Lithuania,,,,https://www.delfi.lt/news/melo-detektorius/mel...,poynter,Baltarusija atšventė nesisaugodama nuo koronav...,Gegužės 9-ąją Baltarusijos sostinėje Minske su...,,lt
1,AfricaCheck,"United States, South Africa",False,Muammar Gaddafi predicted the current coronav...,2020/03/21,United States,South Africa,,,https://africacheck.org/fbcheck/no-gaddafi-did...,poynter,"No, Gaddafi didn’t predict coronavirus, but di...",As the world struggles to bring the new corona...,,en
2,AFP,Hong Kong,False,Video shows quarantined people in a building ...,2020/02/17,Hong Kong,,,,http://u.afp.com/QuarantineChina,poynter,"This video shows tower blocks in Shanghai, not...","It has been viewed more than 200,000 times., T...",,en


In [8]:
df.describe()

Unnamed: 0,verifiedby,country,class,title,published_date,country1,country2,country3,country4,article_source,ref_source,source_title,content_text,category,lang
count,5531,5306,5529,5531,5531,5306,451,93,29,5524,5531,5189,4938,225,5281
unique,96,344,60,5404,246,108,85,42,25,5472,2,4838,4752,14,39
top,AFP,India,False,Does Trump Own Stock in a Company CDC Uses for...,2020/03/17,India,North Africa,United States,Spain,http://www.ecuadorchequea.com/las-mentiras-sob...,poynter,Misbar - مسبار,Las mentiras sobre este tema abarcan desde la ...,US Government Response,en
freq,515,1054,2110,4,102,1062,106,8,3,17,5306,79,43,36,2069


In [9]:
df['class'].value_counts(normalize=True)

False                                 0.381624
FALSE                                 0.376017
Misleading                            0.083740
MISLEADING                            0.031290
Mostly false                          0.018267
Partly false                          0.017906
misleading                            0.014469
No evidence                           0.010309
Mostly False                          0.007777
Mixture                               0.007415
True                                  0.004522
No Evidence                           0.004341
Explanatory                           0.004341
News                                  0.004160
PARTLY FALSE                          0.003979
Unproven                              0.002170
MOSTLY FALSE                          0.001990
Miscaptioned                          0.001809
Partly False                          0.001809
mostly false                          0.001628
partly false                          0.001628
Mostly True  

## Preprocess
### Labels

In [57]:
def true_or_false(x):
    if x.lower() in ["correct", "true", "explanatory", "correct attribution", "news"]:
        x = "true"
    else:
        x = "false"
    return x

df['label'] = df["class"].apply(str).apply(true_or_false)

In [58]:
def build_labels(df, true_list=['True']):
    y = df['class'].copy()
    y.loc[[i in true_list for i in df['class']]] = 'true'
    return y

In [59]:
df['label'].value_counts(normalize=True)

false    0.985717
true     0.014283
Name: label, dtype: float64

## Features

## Baseline

In [96]:
# Split in train and validation
df_train, df_val = train_test_split(df, test_size=0.2, random_state=42, stratify=df['label'])

# Build the pipeline
clf = Pipeline([('tfidf', TfidfVectorizer()),
                ('classifier', RandomForestClassifier(random_state = 42))])

# Train the classifier
clf.fit(map(str, df_train['title'].values), df_train['label'].values)

# Predict train data
predicted_train = clf.predict(map(str, df_train['title'].values))
print('Train score: ', np.mean(predicted_train == df_train['label']))

# Predict validation data
predicted_val = clf.predict(map(str, df_val['title'].values))
print('Validation score: ', np.mean(predicted_val == df_val['label']))

# Predict test data
predicted_test = clf.predict(map(str, df_test['title'].values))

Train score:  1.0
Validation score:  0.987353206865402


In [21]:
def build_submission(predictions):
    submission = pd.DataFrame()
    submission['class'] = predictions
    submission.to_csv('./submission.csv', index=False)
    return submission

In [22]:
submission = build_submission(predicted_test)
submission

Unnamed: 0,class
0,false
1,false
2,false
3,false
4,false
...,...
2087,false
2088,false
2089,false
2090,false


In [17]:
submission.to_csv('./submission.csv')

In [106]:
f1_score(df_val['label'], predicted_val)

  if pos_label not in present_labels:


ValueError: pos_label=1 is not a valid label: array(['false', 'true'], dtype='<U5')

In [103]:
f1_score(y_true=df_val["label"].values, y_pred=predicted_val, pos_label="true")

0.0

In [88]:
build_submission(predicted_test).value_counts()

class
false    2090
true        2
dtype: int64

In [73]:
sample_submission

Unnamed: 0,class
0,False
1,True
2,True
3,True
4,False
5,False
6,False
7,True
8,True
9,True
