In [2]:
import numpy as np 
import pandas as pd 
import sys 
from tqdm import tqdm 
import os 
from random import sample 
from annoy import AnnoyIndex 
import matplotlib.pyplot as plt

from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay 
from sklearn.metrics import accuracy_score, recall_score, precision_score, f1_score 
from sklearn import metrics 

In [4]:
# File locations, all relative to the notebook's working directory.
DATA_DIR = './Data'

# Base and "enriched" CSVs (not read in the cells shown here).
train_file = f'{DATA_DIR}/train.csv'
train_enriched_file = f'{DATA_DIR}/train_enriched.csv'
test_file = f'{DATA_DIR}/test.csv'
test_enriched = f'{DATA_DIR}/test_enriched.csv'

# CSVs that carry the serialized 'emb' column; these are the ones loaded below.
# NOTE(review): the names end in "_emd" while the files end in "_emb".
train_emd = f'{DATA_DIR}/train_emb.csv'
test_emd = f'{DATA_DIR}/test_emb.csv'

# Presumably the length of each embedding vector -- TODO confirm against the data.
embeddings_dim = 384

In [5]:
# Load the three input tables with identical reader options:
#   df       <- training rows (with the 'emb' column)
#   df_test  <- test rows (with the 'emb' column)
#   df_true  <- the submission file
df, df_test, df_true = (
    pd.read_csv(csv_path, encoding='utf-8')
    for csv_path in (train_emd, test_emd, './Data/submission.csv')
)

In [6]:
# Target column of the submission file.
# NOTE(review): `y` is rebound to df['target'] before the train/test split,
# so this value is not what the models are trained on.
y = df_true.loc[:, 'target']

In [7]:
# Peek at two random training rows before any columns are removed.
df.sample(n=2)

Unnamed: 0,id,keyword,text,target,word_count,unique_words_count,Tweet_len,special_chars_count,hash_count,@_count,URL_count,sentiment,subjectivity,dis%,text_clean,keyword_clean,newtext,tx_key,emb
1014,1483,body%20bags,Child Shoulder Bags PVC Shoulder Book Bag Cart...,0,15,13,121,10,0,0,2,0.0,0.0,0.024390243902439,child shoulder bag pvc shoulder book bag carto...,bodi bag,child shoulder bag pvc shoulder book bag carto...,Child Shoulder Bags PVC Shoulder Book Bag Cart...,"[-0.031108854338526726, 0.016074178740382195, ..."
4880,7071,meltdown,@nprfreshair I really can't believe he is skip...,0,15,15,104,8,0,1,0,-0.05625,0.44375,0.1515151515151515,realli can't believ skip republican meltdown....,meltdown,realli can't believ skip republican meltdown....,@nprfreshair I really can't believe he is skip...,"[0.05379693955183029, -0.01638420671224594, 0...."


In [8]:
# Remove the raw and pre-processed text columns from the training frame (in place).
df.drop(
    columns=[
        'keyword',
        'text',
        'text_clean',
        'keyword_clean',
        'newtext',
        'tx_key',
    ],
    inplace=True,
)

In [10]:
# Two random rows again, now without the text columns.
df.sample(n=2)

Unnamed: 0,id,target,word_count,unique_words_count,Tweet_len,special_chars_count,hash_count,@_count,URL_count,sentiment,subjectivity,dis%,emb
2437,3531,1,11,11,83,5,0,0,1,-0.2,0.0,1.0,"[-0.05146445333957672, 0.034089282155036926, -..."
2596,3764,0,9,9,47,1,0,0,0,0.6,1.0,0.1764705882352941,"[0.010588926263153553, 0.01868719793856144, 0...."


In [33]:
# Pull the serialized embeddings and the matching tweet ids out as plain
# Python lists (same row order for both).
emb_list, id_list = (df[column].tolist() for column in ('emb', 'id'))

In [34]:
# Each entry of emb_list is the text form of a list of floats, e.g.
# "[-0.0311, 0.0160, ...]". Turn every entry into a list[float].
# Splitting on ',' alone and letting float() ignore surrounding whitespace
# keeps the parse working even if the separator is not exactly ", ".
embeddings = [
    [float(token) for token in serialized.strip('][').split(',')]
    for serialized in emb_list
]

In [35]:
# Sanity check of the container and element types after parsing.
print(
    type(emb_list),
    type(id_list),
    type(embeddings[0]),
    type(embeddings[0][0]),
    sep='\n',
)

<class 'list'>
<class 'list'>
<class 'list'>
<class 'float'>


In [37]:
# Reduce every embedding vector to four scalar statistics (sum, mean, min, max)
# and store them in df_emb, one row per tweet, indexed by the tweet id.
# The rows are gathered in a list and the frame is created once: growing a
# DataFrame with pd.concat inside the loop copies all previous rows on every
# iteration, which is quadratic in the number of tweets.
emb_stat_rows = []
for item, i in zip(embeddings, id_list):
    vector = np.asarray(item, dtype=float)
    emb_stat_rows.append([i, vector.sum(), vector.mean(), vector.min(), vector.max()])

df_emb = pd.DataFrame(
    emb_stat_rows,
    index=[row[0] for row in emb_stat_rows],  # same index as before: the tweet id
    columns=['emd_id', 'sum_emb', 'avg_emb', 'min_emb', 'max_emb'],
)

In [38]:
# Preview the per-tweet embedding statistics.
df_emb.head()

# Disabled experiment, kept for reference only.
# NOTE(review): df['emb'] holds the serialized string form at this point, and the
# column is named "max" although the lambda computes the mean.
#df['emb_max'] = df['emb'].apply(lambda e: np.mean(e))

Unnamed: 0,emd_id,sum_emb,avg_emb,min_emb,max_emb
1,1,-0.13811,-0.00036,-0.153698,0.137824
4,4,0.288894,0.000752,-0.141703,0.149274
5,5,-0.414505,-0.001079,-0.147163,0.131376
6,6,0.299149,0.000779,-0.148119,0.133969
7,7,0.075128,0.000196,-0.133673,0.138465


In [40]:
# The serialized embedding text is no longer needed once df_emb exists.
df.drop(columns='emb', inplace=True)

In [41]:
# Attach the embedding statistics to each training row by tweet id
# (left join: every row of df is kept).
df = pd.merge(
    df,
    df_emb,
    how='left',
    left_on='id',
    right_on='emd_id',
)

In [42]:
# First five rows after the merge.
df.head(n=5)

Unnamed: 0,id,target,word_count,unique_words_count,Tweet_len,special_chars_count,hash_count,@_count,URL_count,sentiment,subjectivity,dis%,emd_id,sum_emb,avg_emb,min_emb,max_emb
0,1,1,13,13,69,1,1,0,0,0.0,0.0,no_keyword,1,-0.13811,-0.00036,-0.153698,0.137824
1,4,1,7,7,38,1,0,0,0,0.1,0.4,no_keyword,4,0.288894,0.000752,-0.141703,0.149274
2,5,1,22,20,133,3,0,0,0,-0.01875,0.3875,no_keyword,5,-0.414505,-0.001079,-0.147163,0.131376
3,6,1,8,8,65,2,1,0,0,0.0,0.0,no_keyword,6,0.299149,0.000779,-0.148119,0.133969
4,7,1,16,15,88,2,2,0,0,0.0,0.0,no_keyword,7,0.075128,0.000196,-0.133673,0.138465


In [44]:
# Drop the identifier columns and the two count columns that are not used as features.
# 'emd_id' is the join key brought in by the merge (it equals 'id'); it must be
# removed as well, otherwise a tweet identifier ends up in the feature matrix
# and the frame no longer matches the column list shown in the output below.
df = df.drop(columns=['id', 'emd_id', '@_count', 'URL_count'])

In [48]:
# Median of the numeric 'dis%' values; non-numeric entries (such as the
# 'no_keyword' placeholder) are coerced to NaN and therefore ignored.
dis_numeric = pd.to_numeric(df['dis%'], errors='coerce')
fillva = dis_numeric.median()
fillva

0.3783783783783784

In [49]:
# Replace non-numeric 'dis%' entries (the 'no_keyword' placeholder) with the median
# computed above and store the column as real floats. The previous element-wise
# replacement left an object column mixing numeric strings with float values.
# Using the same errors='coerce' rule as the median keeps both steps consistent.
df['dis%'] = pd.to_numeric(df['dis%'], errors='coerce').fillna(fillva)
df.head()

Unnamed: 0,target,word_count,unique_words_count,Tweet_len,special_chars_count,hash_count,sentiment,subjectivity,dis%,sum_emb,avg_emb,min_emb,max_emb
0,1,13,13,69,1,1,0.0,0.0,0.378378,-0.13811,-0.00036,-0.153698,0.137824
1,1,7,7,38,1,0,0.1,0.4,0.378378,0.288894,0.000752,-0.141703,0.149274
2,1,22,20,133,3,0,-0.01875,0.3875,0.378378,-0.414505,-0.001079,-0.147163,0.131376
3,1,8,8,65,2,1,0.0,0.0,0.378378,0.299149,0.000779,-0.148119,0.133969
4,1,16,15,88,2,2,0.0,0.0,0.378378,0.075128,0.000196,-0.133673,0.138465


## <font color='darkgreen'>Classification models</font>

In [54]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import train_test_split

In [52]:
# Split the frame into the label vector and the feature matrix
# (every remaining column except 'target' is a feature).
y = df['target']
X = df.drop(columns='target')

In [55]:
# Hold out 25% of the rows for evaluation; the seed fixes the split.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.25,
    random_state=42,
)

In [105]:
# Candidate classifiers compared in the cells below.
# The tree and the forest are seeded like AdaBoost already was, so their
# reported metrics can be reproduced after a kernel restart.
tree = DecisionTreeClassifier(max_depth=5, random_state=0)
forest = RandomForestClassifier(max_depth=5, n_estimators=100, max_features=1, random_state=0)
ada = AdaBoostClassifier(n_estimators=100, random_state=0, learning_rate=1.1)
knn = KNeighborsClassifier(n_neighbors=20)

In [106]:
# AdaBoost: fit on the training split, predict the held-out split, and report
# accuracy plus macro-averaged F1 / precision / recall.
y_pred = ada.fit(X_train, y_train).predict(X_test)

accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred, average='macro')
recall = recall_score(y_test, y_pred, average='macro')
f1 = f1_score(y_test, y_pred, average='macro')

print(
    f'Accuracy  {accuracy}',
    f'F1        {f1}',
    f'Precision {precision}',
    f'Recall    {recall}',
    sep='\n',
)

Accuracy  0.7633262260127932
F1        0.7541728496499163
Precision 0.7593241709746564
Recall    0.7514540316411258


In [90]:
# Decision tree: fit on the training split, predict the held-out split, and
# report accuracy plus macro-averaged F1 / precision / recall.
y_pred = tree.fit(X_train, y_train).predict(X_test)

accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred, average='macro')
recall = recall_score(y_test, y_pred, average='macro')
f1 = f1_score(y_test, y_pred, average='macro')

print(
    f'Accuracy  {accuracy}',
    f'F1        {f1}',
    f'Precision {precision}',
    f'Recall    {recall}',
    sep='\n',
)

Accuracy  0.7484008528784648
F1        0.7324544387680489
Precision 0.7505023181454836
Recall    0.7280591137364217


In [91]:
# Random forest: fit on the training split, predict the held-out split, and
# report accuracy plus macro-averaged F1 / precision / recall.
y_pred = forest.fit(X_train, y_train).predict(X_test)

accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred, average='macro')
recall = recall_score(y_test, y_pred, average='macro')
f1 = f1_score(y_test, y_pred, average='macro')

print(
    f'Accuracy  {accuracy}',
    f'F1        {f1}',
    f'Precision {precision}',
    f'Recall    {recall}',
    sep='\n',
)

Accuracy  0.7452025586353944
F1        0.7270334112604714
Precision 0.7499107753329419
Recall    0.7225876465238892


In [73]:
# k-nearest neighbours: fit on the training split, predict the held-out split,
# and report accuracy plus macro-averaged F1 / precision / recall.
y_pred = knn.fit(X_train, y_train).predict(X_test)

accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred, average='macro')
recall = recall_score(y_test, y_pred, average='macro')
f1 = f1_score(y_test, y_pred, average='macro')

print(
    f'Accuracy  {accuracy}',
    f'F1        {f1}',
    f'Precision {precision}',
    f'Recall    {recall}',
    sep='\n',
)

Accuracy  0.6433901918976546
F1        0.6229295275487643
Precision 0.6321339575624858
Recall    0.6222504392660153
