## This section loads all of my cleaned tweets and converts them into TSV files for classification with BERT (PyTorch / Hugging Face).

In [1]:
import pandas as pd
import string
import nltk
from numpy.random import RandomState
wn = nltk.WordNetLemmatizer()
import re
prefix = 'data/'

In [2]:
# Read the cleaned-tweets CSV into a DataFrame.
df = pd.read_csv(filepath_or_buffer='data/cleaned_tweets.csv')

In [18]:
# Preview the first two rows to check which columns are available.
df.head(n=2)

Unnamed: 0.1,Unnamed: 0,text,label,text_noURLs,text_punct,text_removeRT,tokenized,nonstop,Tweet_lemmatized,stemmed,remove_less_freq,remove_less_freq_lemma,document,document_lemma,document_stemmed
0,0,The free flu jab is also available for people ...,0,The free flu jab is also available for people ...,The free flu jab is also available for people ...,The free flu jab also available for people wi...,"['the', 'free', 'flu', 'jab', 'also', 'availab...","['free', 'flu', 'jab', 'also', 'available', 'p...","['free', 'flu', 'jab', 'also', 'available', 'p...","['free', 'flu', 'jab', 'also', 'avail', 'peopl...","['free', 'jab', 'also', 'avail', 'peopl', 'cer...","['free', 'jab', 'also', 'certain']",free jab also avail peopl certain medic condit...,free flu jab also available people certain med...,free flu jab also avail peopl certain medic co...
1,1,RT @OUBarstool: “Baker deserved a national cha...,0,RT : “Baker deserved a national championship m...,RT “Baker deserved a national championship mo...,“Baker deserved national championship more ...,"['', 'baker', 'deserved', 'national', 'champio...","['', 'baker', 'deserved', 'national', 'champio...","['', 'baker', 'deserved', 'national', 'champio...","['', 'baker', 'deserv', 'nation', 'championshi...","['deserv', 'nation', 'anyon', 'averag']",[],deserv nation anyon averag,baker deserved national championship anyone p...,baker deserv nation championship anyon planet...


In [4]:
# Keep only the lightly cleaned tweet text and its label,
# with a fresh 0..n-1 index.
new = df.loc[:, ['text_removeRT', 'label']].reset_index(drop=True)

In [5]:
# Some tweets were intentionally left unlabeled during manual annotation
# (to aim for more balanced classes), so their label is NaN.
# Drop those rows on the frame itself: calling dropna(inplace=True) on
# `new.label` acts on the extracted Series and does not remove rows from `new`.
new = new.dropna(subset=['label']).reset_index(drop=True)
# A column that contained NaN is loaded as float; cast back to integers so the
# labels are later written out as 0/1 rather than 0.0/1.0.
new['label'] = new['label'].astype('int64')

In [6]:
# Class balance of the labeled tweets.
new['label'].value_counts()

0    3427
1     896
Name: label, dtype: int64

In [7]:
# Preview the first five rows of the reduced frame.
new.head(n=5)

Unnamed: 0,text_removeRT,label
0,The free flu jab also available for people wi...,0
1,“Baker deserved national championship more ...,0
2,Got flu jab today now the office trying ...,0
3,Hummm ive been with headache for the last tw...,1
4,Throw the whole boyfriend away,0


In [8]:
# Shuffle the rows and renumber the index.
# A fixed seed makes this step produce the same row order on every run,
# instead of a different permutation after each kernel restart.
new = new.sample(frac=1, random_state=42).reset_index(drop=True)

In [9]:
# Split the data into a training set (70%) and a test set (the remaining 30%).
# Seed the generator: an unseeded RandomState() draws a different sample on
# every run, so the split could not be reproduced.
rng = RandomState(42)

df_train = new.sample(frac=0.70, random_state=rng)
# Every row that was not drawn for training goes to the held-out set.
df_test = new.loc[~new.index.isin(df_train.index)]

In [10]:
# Training rows with just the text and label columns, re-indexed from 0.
train = df_train.loc[:, ['text_removeRT', 'label']].reset_index(drop=True)

In [11]:
# Show (rows, columns) for the train split, then the test split.
print(df_train.shape, df_test.shape, sep='\n')

(3026, 2)
(1297, 2)


## Converting the data into a format that BERT understands. It requires id, label, alpha, and text columns.

In [12]:
# Build the four-column training frame: id, label, alpha, text.
# Columns are selected by name rather than by position so the result does not
# depend on the column order of `train`.
train_df = pd.DataFrame({
    'id': range(len(train)),
    'label': train['label'],
    # Filler column: the same letter on every row.
    'alpha': ['a'] * len(train),
    # Flatten each tweet onto one line. Carriage returns and tabs are replaced
    # along with newlines, because any of them would break a tab-separated,
    # one-record-per-line file.
    'text': train['text_removeRT'].replace(r'[\r\n\t]', ' ', regex=True),
})

train_df.head()

Unnamed: 0,id,label,alpha,text
0,0,0,a,Would you
1,1,0,a,amp ’ sick 😂
2,2,0,a,Being bed ridden with for days has given way...
3,3,1,a,love when flu turns bronchitis really can’...
4,4,0,a,yeah sure use antibiotics cure viral diseases...


In [13]:
# Class balance within the training split.
train_df['label'].value_counts()

0    2389
1     637
Name: label, dtype: int64

In [14]:
# Test rows with just the text and label columns, re-indexed from 0.
# drop=True discards the old index instead of inserting it as an extra
# 'index' column, so `test` has the same two-column layout as `train`.
test = pd.DataFrame(df_test[['text_removeRT', 'label']]).reset_index(drop=True)

In [15]:
# Build the four-column dev frame: id, label, alpha, text.
# Columns are selected by name rather than by position so the result does not
# depend on whether `test` carries an extra leading 'index' column.
dev_df = pd.DataFrame({
    'id': range(len(test)),
    'label': test['label'],
    # Filler column: the same letter on every row.
    'alpha': ['a'] * len(test),
    # Flatten each tweet onto one line. Carriage returns and tabs are replaced
    # along with newlines, because any of them would break a tab-separated,
    # one-record-per-line file.
    'text': test['text_removeRT'].replace(r'[\r\n\t]', ' ', regex=True),
})

dev_df.head()

Unnamed: 0,id,label,alpha,text
0,0,0,a,either have the flu food poisoning
1,1,0,a,bang
2,2,0,a,this disgusting faking flu shot Especially ...
3,3,1,a,omg have the flu too
4,4,0,a,‚¶


In [16]:
# Class balance within the dev split.
dev_df['label'].value_counts()

0    1038
1     259
Name: label, dtype: int64

## Saving the data as TSV files so that BERT/XLNet can read them.

In [17]:
# Write each split as a tab-separated file with no header row and no index column.
for frame, path in ((train_df, 'data/train.tsv'), (dev_df, 'data/dev.tsv')):
    frame.to_csv(path, sep='\t', index=False, header=False)