In [1]:
import logging
import torch
import numpy as np
import pandas as pd
from transformers import AutoTokenizer, Trainer
from datasets import Dataset, ClassLabel
import fasttext

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
from train_test_split import select_train_with_cluster, select_eval_with_cluster
from preprocessing import preprocess
from evaluation import evaluate
from bert import tokenize, get_BERT, prepare_dataset, compute_metrics

[nltk_data] Downloading package stopwords to /home/jonhue/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package omw-1.4 to /home/jonhue/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!
[nltk_data] Downloading package wordnet to /home/jonhue/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [3]:
logging.basicConfig(level=logging.INFO)

In [4]:
device = 'cuda' if torch.cuda.is_available() else 'cpu'
device

'cuda'

In [None]:
df_cluster_map_train = pd.read_csv('clustering+bert/train.csv')[['index', 'cluster']]
df_cluster_map_eval = pd.read_csv('clustering+bert/eval.csv')[['index', 'cluster']]
df_cluster_map = pd.concat([df_cluster_map_train, df_cluster_map_eval])
df_cluster_map

Unnamed: 0,index,cluster
0,157049.0,1
1,2366208.0,2
2,1948945.0,0
3,1684769.0,5
4,2262152.0,1
...,...,...
1249995,1478680.0,2
1249996,1972646.0,4
1249997,1710597.0,5
1249998,1835784.0,4


In [None]:
CLUSTER = 5

In [None]:
df_train = select_train_with_cluster(df_cluster_map, CLUSTER, size=None)
df_train['type'] = 'train'
df_train

Unnamed: 0,index,text,label,type,cluster
3,1684769,<user> <user> i'll say it again ( about ko i l...,0,train,5
6,1550874,<user> yeah i know and thanks <user>\n,0,train,5
9,1764436,mother and daughter relationships are so beaut...,0,train,5
22,1647395,"<user> so you're not saying "" ta ta "" back ?\n",0,train,5
24,623491,<user> i where got so bad one ? ! ? i very nic...,1,train,5
...,...,...,...,...,...
1249985,1867868,"<user> that true , also happened to me , peopl...",0,train,5
1249989,1068566,<user> thank you so much for the pet food mat ...,1,train,5
1249991,2219493,<user> why he is so sexi ? ! i want to cry whe...,0,train,5
1249994,2253091,<user> sweet yes .. but still poor\n,0,train,5


In [None]:
df_eval = select_eval_with_cluster(df_cluster_map, CLUSTER, size=None)
df_eval['type'] = 'eval'
df_eval

Unnamed: 0,index,text,label,type,cluster
5,1977399,"<user> i fell asleep , and now i'm so sleepy b...",0,eval,5
8,311419,"<user> oh good luck , but i am sure you do not...",1,eval,5
13,2306570,<user> i know come watch it with me instead of...,0,eval,5
15,1657606,i called my mommy crying but she didnt answer\n,0,eval,5
20,526876,i don't get into the birthday tweet . but 9:02...,1,eval,5
...,...,...,...,...,...
1249988,1857871,i still ain seen think like a man < < < ... im...,0,eval,5
1249990,960284,<user> yeah yeah <3\n,1,eval,5
1249992,681281,treat me like a joke & i'll leave you likes it...,1,eval,5
1249994,1218627,<user> good friday night to you ! how r u ? i'...,1,eval,5


In [None]:
df = pd.concat([df_train, df_eval])
df['x'] = df['text']
df['x'] = df['x'].apply(lambda x: x.replace('\n', ''))
df

Unnamed: 0,index,text,label,type,cluster,x
3,1684769,<user> <user> i'll say it again ( about ko i l...,0,train,5,<user> <user> i'll say it again ( about ko i l...
6,1550874,<user> yeah i know and thanks <user>\n,0,train,5,<user> yeah i know and thanks <user>
9,1764436,mother and daughter relationships are so beaut...,0,train,5,mother and daughter relationships are so beaut...
22,1647395,"<user> so you're not saying "" ta ta "" back ?\n",0,train,5,"<user> so you're not saying "" ta ta "" back ?"
24,623491,<user> i where got so bad one ? ! ? i very nic...,1,train,5,<user> i where got so bad one ? ! ? i very nic...
...,...,...,...,...,...,...
1249988,1857871,i still ain seen think like a man < < < ... im...,0,eval,5,i still ain seen think like a man < < < ... im...
1249990,960284,<user> yeah yeah <3\n,1,eval,5,<user> yeah yeah <3
1249992,681281,treat me like a joke & i'll leave you likes it...,1,eval,5,treat me like a joke & i'll leave you likes it...
1249994,1218627,<user> good friday night to you ! how r u ? i'...,1,eval,5,<user> good friday night to you ! how r u ? i'...


In [None]:
PREPROCESSING = {'remove_tags': True}

preprocess(df, flags=PREPROCESSING)
df

Unnamed: 0,index,text,label,type,cluster,x
3,1684769,<user> <user> i'll say it again ( about ko i l...,0,train,5,i'll say it again ( about ko i love that man ....
6,1550874,<user> yeah i know and thanks <user>\n,0,train,5,yeah i know and thanks
9,1764436,mother and daughter relationships are so beaut...,0,train,5,mother and daughter relationships are so beaut...
22,1647395,"<user> so you're not saying "" ta ta "" back ?\n",0,train,5,"so you're not saying "" ta ta "" back ?"
24,623491,<user> i where got so bad one ? ! ? i very nic...,1,train,5,i where got so bad one ? ! ? i very nice one e...
...,...,...,...,...,...,...
1249988,1857871,i still ain seen think like a man < < < ... im...,0,eval,5,i still ain seen think like a man < < < ... im...
1249990,960284,<user> yeah yeah <3\n,1,eval,5,yeah yeah <3
1249992,681281,treat me like a joke & i'll leave you likes it...,1,eval,5,treat me like a joke & i'll leave you likes it...
1249994,1218627,<user> good friday night to you ! how r u ? i'...,1,eval,5,good friday night to you ! how r u ? i'm havin...


In [None]:
try:
    os.remove('tweets.txt')
except OSError:
    pass

f = open('tweets.txt', 'w')
for index, row in df.iterrows():
    f.write(row['x']+"\n")
f.close()

In [None]:
model = fasttext.train_unsupervised('tweets.txt', model='skipgram', dim=100)

In [None]:
df['vec'] = df['x'].apply(model.get_sentence_vector)

In [None]:
df

Unnamed: 0,index,text,label,type,cluster,x,vec
3,1684769,<user> <user> i'll say it again ( about ko i l...,0,train,5,i'll say it again ( about ko i love that man ....,"[0.12042853, 0.019682324, -0.03543307, -0.0314..."
6,1550874,<user> yeah i know and thanks <user>\n,0,train,5,yeah i know and thanks,"[0.0779454, 0.056956816, -0.009026276, -0.0828..."
9,1764436,mother and daughter relationships are so beaut...,0,train,5,mother and daughter relationships are so beaut...,"[0.09517352, 0.046535388, -0.028901964, -0.046..."
22,1647395,"<user> so you're not saying "" ta ta "" back ?\n",0,train,5,"so you're not saying "" ta ta "" back ?","[0.11969137, -0.014625407, 0.010730671, -0.022..."
24,623491,<user> i where got so bad one ? ! ? i very nic...,1,train,5,i where got so bad one ? ! ? i very nice one e...,"[0.09957587, 0.002426933, -0.05698955, -0.0698..."
...,...,...,...,...,...,...,...
1249988,1857871,i still ain seen think like a man < < < ... im...,0,eval,5,i still ain seen think like a man < < < ... im...,"[0.10276047, 0.021412222, -0.071788505, -0.071..."
1249990,960284,<user> yeah yeah <3\n,1,eval,5,yeah yeah <3,"[0.14361914, 0.043967705, -0.08308176, -0.1322..."
1249992,681281,treat me like a joke & i'll leave you likes it...,1,eval,5,treat me like a joke & i'll leave you likes it...,"[0.09123084, -0.039600827, -0.023292731, -0.04..."
1249994,1218627,<user> good friday night to you ! how r u ? i'...,1,eval,5,good friday night to you ! how r u ? i'm havin...,"[0.103570715, -0.0122932745, -0.041420713, -0...."


In [None]:
df_train = df[df['type'] == 'train'].reset_index()
df_eval = df[df['type'] == 'eval'].reset_index()

In [None]:
np.vstack(df_train['vec']).shape

(329028, 100)

In [None]:
from xgboost import XGBClassifier
xgb_model = XGBClassifier(verbosity=3, max_depth=8, n_estimators=10000, learning_rate=0.1, base_score=0.55,  tree_method='gpu_hist', predictor='gpu_predictor')

In [None]:
xgb_model = xgb_model.fit(np.vstack(df_train['vec']), df_train['label'])

In [None]:
y_predict = xgb_model.predict_proba(np.vstack(df_eval['vec']))

In [None]:
y_predict.shape

(329491, 2)

In [None]:
compute_metrics((y_predict, df_eval['label']))

{'accuracy': 0.7825312375755332,
 'auc': 0.8628598047172129,
 'bce': 0.5174705837449127,
 'confidence': 0.694219,
 'confidence_std': 0.058600802,
 'correct_confidence': 0.7033805,
 'correct_confidence_std': 0.05123694,
 'f1': 0.8054434772220019,
 'incorrect_confidence': 0.6612521,
 'incorrect_confidence_std': 0.070396595,
 'precision': 0.789272030651341,
 'recall': 0.8222914610753213}

In [None]:
compute_metrics((xgb_model.predict_proba(np.vstack(df_train['vec'])), df_train['label']))

{'accuracy': 0.9977904616020521,
 'auc': 0.9999873976019338,
 'bce': 0.315979740465936,
 'confidence': 0.7296772,
 'confidence_std': 0.012573124,
 'correct_confidence': 0.73002845,
 'correct_confidence_std': 0.009654792,
 'f1': 0.9979727223800853,
 'incorrect_confidence': 0.57112324,
 'incorrect_confidence_std': 0.06524211,
 'precision': 0.9979309806371018,
 'recall': 0.998014467615186}