Loading.py

In [1]:
import pandas as pd
import os
from typing import List, Tuple

# Directory holding the twitter dataset files. Built with os.path.join rather
# than a "data\twitter-datasets" literal: inside a normal string "\t" is a TAB
# escape, which silently corrupted the path.
DIR = os.path.join("data", "twitter-datasets")


def _read_data(path: str) -> List[str]:
  """
  Read the text file at `path` and return its lines as a list.

  Each element is one line exactly as yielded by the file object, so the
  trailing newline (when present) is kept.
  """
  with open(path, 'r') as handle:
    return handle.readlines()


def _read_data_with_ids(path: str) -> Tuple[List[str], List[str]]:
  """
  Read a file whose lines have the form ``<id>,<text>``.

  Every line is split at its first comma only, so commas inside the text are
  preserved. Returns ``(ids, texts)`` as two parallel lists; the text part
  keeps whatever trailing newline the line had. A line without any comma
  raises ``ValueError`` when it is unpacked.
  """
  with open(path, 'r') as handle:
    pairs = [raw.split(',', maxsplit=1) for raw in handle]

  ids: List[str] = []
  texts: List[str] = []
  for ident, text in pairs:
    ids.append(ident)
    texts.append(text)
  return ids, texts


def load_train(full=False, data_dir: str = '/home') -> pd.DataFrame:
  """
  Load the labelled training tweets into a single DataFrame.

  Parameters:
    full: when true, read the ``train_pos_full.txt`` / ``train_neg_full.txt``
      files instead of ``train_pos.txt`` / ``train_neg.txt``. (Previously this
      flag was accepted but silently ignored.)
    data_dir: directory containing the training files. Defaults to ``/home``,
      the location the notebook has been reading from.

  Returns:
    A DataFrame with columns ``index``, ``x`` (raw line, trailing newline
    included) and ``y`` (1 for rows from the positive file, 0 for rows from
    the negative file). Positive rows come first.

  Raises:
    Whatever ``open`` raises (e.g. ``FileNotFoundError``) if a file is missing.
  """
  # Report the directory that is actually read, not an unrelated constant.
  print("LOADING DATA FROM ", data_dir)

  suffix = '_full' if full else ''
  labelled_paths = (
    (os.path.join(data_dir, 'train_pos' + suffix + '.txt'), 1),
    (os.path.join(data_dir, 'train_neg' + suffix + '.txt'), 0),
  )

  frames = []
  for path, label in labelled_paths:
    frame = pd.DataFrame({'x': _read_data(path)})
    frame['y'] = label
    frames.append(frame)

  # reset_index() is kept on purpose: it materialises the fresh 0..n-1 index
  # as an 'index' column, which is part of the frame callers already receive.
  return pd.concat(frames, ignore_index=True).reset_index()


def load_test() -> pd.DataFrame:
  """
  Load ``test_data.txt`` from ``DIR`` into a DataFrame.

  The ids parsed from each line become the frame's index (as strings) and the
  remaining text of each line goes into the single column ``x``.
  """
  ids, texts = _read_data_with_ids(os.path.join(DIR, 'test_data.txt'))
  return pd.DataFrame({'x': texts}, index=ids)


Preprocessing.py

In [2]:
import pandas as pd


def remove_tags(df: pd.DataFrame):
  """
  Strip the ``<user>`` and ``<url>`` placeholders from column ``x``.

  Works in place: ``df['x']`` is overwritten with the cleaned strings and
  nothing is returned. After the placeholders are removed, leading and
  trailing whitespace is trimmed from each value.
  """
  def _strip_placeholders(text):
    for tag in ('<user>', '<url>'):
      text = text.replace(tag, '')
    return text.strip()

  df['x'] = df['x'].apply(_strip_placeholders)

Evaluate.py

In [3]:
from cmath import log
import numpy as np
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, log_loss, roc_auc_score
from typing import Tuple
import logging


def _log_metrics(metrics):
  """
  Log a mapping of metric name -> value as one INFO message.

  The message is a block delimited by ``---`` lines with one
  ``* name: value`` bullet per entry, in the mapping's iteration order.
  """
  bullet_lines = []
  for name, value in metrics.items():
    bullet_lines.append(f'* {name}: {value}')

  logging.info('---\n' + '\n'.join(bullet_lines) + '\n---')


def evaluate_prob(y: np.array, y_pred: np.array, verbose=True) -> Tuple[float, float]:
  """
  Score predictions against the ground truth and return ``(bce, auc)``.

  * bce: ``log_loss(y, y_pred)``
  * auc: ``roc_auc_score(y, y_pred)``

  When ``verbose`` is truthy, both values are also logged via ``_log_metrics``.
  """
  scores = {
    'bce': log_loss(y, y_pred),
    'auc': roc_auc_score(y, y_pred),
  }
  if verbose:
    _log_metrics(scores)

  return scores['bce'], scores['auc']


def evaluate(y: np.array, y_pred: np.array) -> Tuple[float, float, float, float, float, float]:
  """
  Compute and log six metrics; returns them as the tuple
  ``(accuracy, precision, recall, f1, bce, auc)``.

  * accuracy: share of predictions equal to the true label
  * precision: share of predicted positives that are truly positive
  * recall: share of true positives that were predicted positive
  * F1: harmonic mean of precision and recall
  * bce, auc: taken from ``evaluate_prob(y, y_pred, verbose=False)``

  NOTE(review): the same ``y_pred`` is passed to the label metrics and to
  ``evaluate_prob``; if it holds hard labels rather than probabilities, the
  BCE value is presumably of limited use - confirm what callers intend.
  """
  metrics = {
    'accuracy': accuracy_score(y, y_pred),
    'precision': precision_score(y, y_pred),
    'recall': recall_score(y, y_pred),
    'f1': f1_score(y, y_pred),
  }
  # Appended after the label metrics so the dict order matches the return order.
  metrics['bce'], metrics['auc'] = evaluate_prob(y, y_pred, verbose=False)
  _log_metrics(metrics)

  return tuple(metrics.values())



Loading and Preprocessing the dataset

In [4]:
import logging
import torch
print("done")

done


In [5]:
# Configure the root logger at INFO level so the logging.info(...) call in
# _log_metrics is actually emitted in the notebook output.
logging.basicConfig(level=logging.INFO)

In [12]:
# Load the labelled training tweets (column x = text, y = 1 for rows from the
# positive file and 0 for rows from the negative file) and display the frame.
df = load_train(full=False)
print("done loading")
df

LOADING DATA FROM  data	witter-datasets
done loading


Unnamed: 0,index,x,y
0,0,<user> i dunno justin read my mention or not ....,1
1,1,"because your logic is so dumb , i won't even c...",1
2,2,""" <user> just put casper in a box ! "" looved t...",1
3,3,<user> <user> thanks sir > > don't trip lil ma...,1
4,4,visiting my brother tmr is the bestest birthda...,1
...,...,...,...
199995,199995,can't wait to fake tan tonight ! hate being pa...,0
199996,199996,<user> darling i lost my internet connection ....,0
199997,199997,kanguru defender basic 4 gb usb 2.0 flash driv...,0
199998,199998,rizan is sad now\n,0


In [13]:
# remove_tags mutates df in place (it overwrites df['x'] and returns nothing),
# so re-display the frame to inspect the cleaned text.
remove_tags(df)
df

Unnamed: 0,index,x,y
0,0,i dunno justin read my mention or not . only j...,1
1,1,"because your logic is so dumb , i won't even c...",1
2,2,""" just put casper in a box ! "" looved the bat...",1
3,3,thanks sir > > don't trip lil mama ... just ke...,1
4,4,visiting my brother tmr is the bestest birthda...,1
...,...,...,...
199995,199995,can't wait to fake tan tonight ! hate being pale,0
199996,199996,darling i lost my internet connection .. and i...,0
199997,199997,kanguru defender basic 4 gb usb 2.0 flash driv...,0
199998,199998,rizan is sad now,0


FastText Implementation: Installation

In [None]:
# Download the fastText 0.2.0 source archive, unpack it and build it with make.
!wget https://github.com/facebookresearch/fastText/archive/0.2.0.zip
!unzip 0.2.0.zip
# %cd changes the notebook's working directory, so the relative paths used by
# later cells (./fasttext, trainCleaned.txt, ...) resolve inside fastText-0.2.0.
%cd fastText-0.2.0
!make
print("done installing")

In [None]:
#OPTIONAL
# Clone the fastText repository and pip-install it from that checkout, then
# show the installed package metadata.
!git clone https://github.com/facebookresearch/fastText.git
# NOTE(review): this %cd is relative to the current working directory and moves
# it again; later cells call ./fasttext with a relative path - confirm which
# directory they are meant to run from if this optional cell is executed.
%cd fastText
!pip install .

!pip show fastText



Adapt Dataset such that it works for training

In [14]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for validation. A fixed random_state makes the split
# (and therefore every metric reported below) reproducible across re-runs.
train, test = train_test_split(df, test_size=0.2, random_state=42)

# Work on an explicit copy so the assignment below never writes into a view of df.
train = train.copy()

# fastText expects each training line to start with its label in the form
# __label__<y>, e.g. "no text back ? yea , he mad" with y = 0 becomes
# "__label__0 no text back ? yea , he mad". Built column-wise instead of
# row by row with iterrows().
train['x'] = '__label__' + train['y'].astype(str) + ' ' + train['x']

In [15]:
from google.colab import files


def _write_lines(path, lines):
    """Write every string in `lines` to `path`, each followed by a newline."""
    # The with-block closes the file even if a write fails part-way through.
    with open(path, 'w') as out:
        out.writelines(line + "\n" for line in lines)


print(train['x'])
print(test['x'])

# Write one example per line: labelled training lines and unlabelled validation text.
_write_lines('trainCleaned.txt', train['x'])
_write_lines('ValCleaned.txt', test['x'])

# Trigger browser downloads of both files (Colab only).
files.download('trainCleaned.txt')
files.download('ValCleaned.txt')

186181    __label__0 amen : 100 chillout classics - the ...
56687         __label__1 i woke up in a really great mood !
193799                                __label__0 he said no
39028     __label__1 hey emeli . massive fan from aus ! ...
78116        __label__1 hey , here's how i gain followers :
                                ...                        
97972                  __label__1 ahhh who's on next then ?
179441    __label__0 just got home after such an exhaust...
122966    __label__0 i'm sad that i wasn't home to sleep...
27628                     __label__1 i can live with that .
12052                      __label__1 haha what can i say !
Name: x, Length: 160000, dtype: object
71137     i always wondered what a guy named bruno names...
13149         loool , i do this :/ what colour isittt ? : o
196451    citizens of humanity maternity : kelly - new p...
107089    my bloody valentine 3d ( two-disc special edit...
16397     haha oh yeah , i personally like when he gets ...
 

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

Training

In [16]:
# Train a supervised fastText classifier on the labelled lines in
# trainCleaned.txt; the prediction cell below loads the result as model.bin.
!./fasttext supervised -input trainCleaned.txt -output model -dim 2

Read 1M wordsRead 2M wordsRead 2M words
Number of words:  100149
Number of labels: 2
Progress: 100.0% words/sec/thread:  748658 lr:  0.000000 loss:  0.406744 ETA:   0h 0m



Predict & Evaluate

In [None]:
#Option 1
!cat ValCleaned.txt
!./fasttext predict model.bin ValCleaned.txt > predictions.txt

In [75]:
# Convert the predictions back to ints: each line of predictions.txt is read
# into the single column 'Labels' (named directly via `names`, so no rename is
# needed) and its last character is taken as the class digit.
preds = pd.read_csv('predictions.txt', names=['Labels'], header=None)
preds['Labels'] = preds['Labels'].str[-1].astype(np.int64)


# NOTE(review): preds holds hard 0/1 labels, so the bce value reported by
# evaluate() is computed on labels rather than probabilities - confirm intent.
evaluate(test['y'], preds['Labels'])


INFO:root:---
* accuracy: 0.825525
* precision: 0.8110605697869284
* recall: 0.848137392349289
* f1: 0.8291847174290818
* bce: 6.02623189178853
* auc: 0.8255566130909615
---


(0.825525,
 0.8110605697869284,
 0.848137392349289,
 0.8291847174290818,
 6.02623189178853,
 0.8255566130909615)

In [23]:
# Trigger a browser download of the prediction file (`files` is the
# google.colab helper imported in an earlier cell).
files.download('predictions.txt')

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>