In [51]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

import torch
import torch.nn as nn
from torch.utils.data import DataLoader

from sklearn.model_selection import train_test_split
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier
from sklearn.model_selection import GridSearchCV, StratifiedKFold

import os

In [52]:
## getting the data
df_train = pd.read_csv("data/train.csv")
df_test = pd.read_csv("data/test.csv")
sample_submission = pd.read_csv("data/sample_submission.csv")

In [53]:
df_train.head()

Unnamed: 0,id,keyword,location,text,target
0,1,,,Our Deeds are the Reason of this #earthquake M...,1
1,4,,,Forest fire near La Ronge Sask. Canada,1
2,5,,,All residents asked to 'shelter in place' are ...,1
3,6,,,"13,000 people receive #wildfires evacuation or...",1
4,7,,,Just got sent this photo from Ruby #Alaska as ...,1


In [54]:
## checking for missing
df_train.isna().sum()

id             0
keyword       61
location    2533
text           0
target         0
dtype: int64

In [55]:
## checking the keyword column
df_train["keyword"].value_counts()

keyword
fatalities               45
deluge                   42
armageddon               42
sinking                  41
damage                   41
                         ..
forest%20fire            19
epicentre                12
threat                   11
inundation               10
radiation%20emergency     9
Name: count, Length: 221, dtype: int64

In [56]:
df_train[df_train["location"].notna()]

Unnamed: 0,id,keyword,location,text,target
31,48,ablaze,Birmingham,@bbcmtd Wholesale Markets ablaze http://t.co/l...,1
32,49,ablaze,Est. September 2012 - Bristol,We always try to bring the heavy. #metal #RT h...,0
33,50,ablaze,AFRICA,#AFRICANBAZE: Breaking news:Nigeria flag set a...,1
34,52,ablaze,"Philadelphia, PA",Crying out for more! Set me ablaze,0
35,53,ablaze,"London, UK",On plus side LOOK AT THE SKY LAST NIGHT IT WAS...,0
...,...,...,...,...,...
7575,10826,wrecked,TN,On the bright side I wrecked http://t.co/uEa0t...,0
7577,10829,wrecked,#NewcastleuponTyne #UK,@widda16 ... He's gone. You can relax. I thoug...,0
7579,10831,wrecked,"Vancouver, Canada",Three days off from work and they've pretty mu...,0
7580,10832,wrecked,London,#FX #forex #trading Cramer: Iger's 3 words tha...,0


In [57]:
df_train["target"].value_counts()

target
0    4342
1    3271
Name: count, dtype: int64

In [58]:
## checking the nature of the tweets

df_train["text"].sample(5).values

array(['No one told me you can drown yourself by drinking too much water.',
       'There are no four truths-of pain of desire that is the origin of pain of the obliteration of that desire of the pain to that obliteration.',
       'US Navy Sidelines 3 Newest Subs - http://t.co/guvTIzyCHE: DefenseNews.comUS Navy Sidelines 3 Newest SubsD... http://t.co/SY2WhXT0K5 #navy',
       'it sure made an impact on me http://t.co/GS50DdG1JY',
       'Germany has  39 gigawatts of installed solar capacity\r\n_One gwatt is about equal to the capacity of a nuclear reactor.\r\nhttp://t.co/leCZOlkmSV'],
      dtype=object)

In [59]:
## for cleaning function
## - remove urls
## - remove # tags
## - remove html special characters eg &amp;
## - remove @text
## - remove [01:04 UTC]
## - strip

In [60]:
import re

## function to preprocessing of text
def process_text(text):
    ## patterns to remove
    rem_pat_1 = "([#@]|https?:)\S*"
    rem_pat_2 = "&\S+;"
    rem_pat_3 = "\[\d+:\d+.+\]" ## removing timestamp. eg. [01:04 UTC]
    rem_pat_4 = "[\-_.+]" ## to remove symbols (make sure to bring last to avoid affecting first two patterns)
    combined_rem_pat = f"({rem_pat_1})|({rem_pat_2})|({rem_pat_3})|({rem_pat_4})"

    text = re.sub(combined_rem_pat, "", text) ## removing text that match patterns
    text = text.strip() ## removing trailing white spaces
    text = text.lower() ## lowercasing

    return text

## function for tokenizing of string
def tokenize(text):
    return re.split("\s+", text)

## function to remove stop words
def remove_stopwords(token_list):
    l = []
    for word in token_list:
        if word not in stopwords:
            l.append(word)
    return l

def full_text_process(text):
    text = process_text(text)
    # text = tokenize(text)
    # text = remove_stopwords(text)
    return text

In [61]:
## creating datatransformers
class GloveVectorizer(BaseEstimator, TransformerMixin):
    def __init__(self, filepath=".", encoding="utf-8"):
        self.filepath = filepath
        self.encoding = encoding

    def fitX(self, X):
        self.unique_words = set()
        self.vectors = []
        self.word2idx = {}

        ## getting unique words
        for text in X:
            self.unique_words |= set(text.split())

        ## builing word to index dictionary and vector matrix
        idx = 0
        with open(self.filepath, encoding=self.encoding) as  file:
            for line in file:
                line = line.split()
                word = line[0]
                if word in self.unique_words:
                    self.word2idx[word] = idx
                    self.vectors.append(line[1:])
                    idx+=1

        self.vectors = np.array(self.vectors, dtype=np.float32)

    def fitY(self, y):
        ## creating integer labels for each category
        unique = {x for x in y}
        self.label2int = dict([(v,i) for i,v in enumerate(unique)])


    def transformX(self, X):
        N = len(X)
        transformed_x = np.zeros((N, self.vectors.shape[1]), dtype=np.float32)
        for i in range(N):
            mat = []
            line = X[i].lower().split()
            for word in line:
                if word in self.word2idx:
                    mat.append(self.vectors[self.word2idx[word]])

            if len(mat) > 0: transformed_x[i] = np.mean(mat, axis=0)
            else: print(f"Sentence at index:{i} has no word in the vector dictionary")

        return np.array(transformed_x)

    def transformY(self, y):
        return np.array([self.label2int[word] for word in y])
        

    def fit(self, X, y=None):
        self.fitX(X)

        if y is not None: self.fitY(y)

        return self
    

    def transform(self, X, y=None):
        if y is not None: return self.transformX(X), self.transformY(y)
        else: return self.transformX(X)

    def fit_transform(self, X, y=None):
        self.fit(X,y)
        return self.transform(X,y)


In [62]:
## getting the training and testing processed data
## train
train_text = df_train["text"].apply(lambda x: full_text_process(x)).values
train_target = df_train["target"].values

In [63]:
random_state = 42
# X_train, X_test, y_train, y_test = train_test_split(train_text, train_target, test_size=0.3, shuffle=True, random_state=random_state)

In [64]:
glove_path = "../../../Machine_Learning_With_Python/NLP_LEARN/data/glove6B/glove.6B.50d.txt"
vectorizer = GloveVectorizer(glove_path, encoding="utf-8")

In [65]:
vectorizer.fit(train_text)

In [66]:
X = vectorizer.transform(train_text)
y = train_target

Sentence at index:24 has no word in the vector dictionary
Sentence at index:783 has no word in the vector dictionary
Sentence at index:2586 has no word in the vector dictionary
Sentence at index:2795 has no word in the vector dictionary
Sentence at index:3491 has no word in the vector dictionary
Sentence at index:3667 has no word in the vector dictionary
Sentence at index:3681 has no word in the vector dictionary
Sentence at index:3683 has no word in the vector dictionary
Sentence at index:4092 has no word in the vector dictionary
Sentence at index:4504 has no word in the vector dictionary
Sentence at index:5115 has no word in the vector dictionary
Sentence at index:5353 has no word in the vector dictionary
Sentence at index:5983 has no word in the vector dictionary
Sentence at index:5987 has no word in the vector dictionary
Sentence at index:5988 has no word in the vector dictionary
Sentence at index:5998 has no word in the vector dictionary
Sentence at index:6313 has no word in the v

### GridSearchCV

In [69]:
kf = StratifiedKFold(n_splits=5, shuffle=True, random_state=random_state)

In [73]:
## random forest gridsearch
params = {
    "n_estimators": [200, 300],
    "max_depth": [ 8, 10],
}

grid = GridSearchCV(RandomForestClassifier(), params, cv=kf, scoring="accuracy", verbose=10)
grid.fit(X,y)

Fitting 5 folds for each of 4 candidates, totalling 20 fits
[CV 1/5; 1/4] START max_depth=8, n_estimators=200...............................
[CV 1/5; 1/4] END max_depth=8, n_estimators=200;, score=0.786 total time=  16.9s
[CV 2/5; 1/4] START max_depth=8, n_estimators=200...............................
[CV 2/5; 1/4] END max_depth=8, n_estimators=200;, score=0.798 total time=  16.6s
[CV 3/5; 1/4] START max_depth=8, n_estimators=200...............................
[CV 3/5; 1/4] END max_depth=8, n_estimators=200;, score=0.778 total time=  16.3s
[CV 4/5; 1/4] START max_depth=8, n_estimators=200...............................
[CV 4/5; 1/4] END max_depth=8, n_estimators=200;, score=0.780 total time=  16.4s
[CV 5/5; 1/4] START max_depth=8, n_estimators=200...............................
[CV 5/5; 1/4] END max_depth=8, n_estimators=200;, score=0.770 total time=  16.4s
[CV 1/5; 2/4] START max_depth=8, n_estimators=300...............................
[CV 1/5; 2/4] END max_depth=8, n_estimators=300;,

In [75]:
print("Best Score:", grid.best_score_)
print("Best Parameters:", grid.best_params_)

Best Score: 0.7894373871335967
Best Parameters: {'max_depth': 10, 'n_estimators': 300}


In [77]:
pred_text = df_test["text"].apply(lambda x: full_text_process(x)).values
pred_text = vectorizer.transform(pred_text)

Sentence at index:14 has no word in the vector dictionary
Sentence at index:748 has no word in the vector dictionary
Sentence at index:1433 has no word in the vector dictionary
Sentence at index:1554 has no word in the vector dictionary
Sentence at index:1733 has no word in the vector dictionary
Sentence at index:2379 has no word in the vector dictionary
Sentence at index:2562 has no word in the vector dictionary
Sentence at index:2567 has no word in the vector dictionary
Sentence at index:2569 has no word in the vector dictionary
Sentence at index:2571 has no word in the vector dictionary
Sentence at index:3015 has no word in the vector dictionary


In [78]:
preds = grid.predict(pred_text)

In [79]:
## creating submission dataframe
ids = df_test["id"].values

sub_df = pd.DataFrame({"id": ids, "target": preds.astype(int)})

## saving as csv
if not os.path.exists("x__submissions"):
    os.mkdir("x__submissions")

sub_df.to_csv("x__submissions/sub_glove_1.csv", index=False)