In [140]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

import torch
import torch.nn as nn
from torch.utils.data import DataLoader

from sklearn.model_selection import train_test_split
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.ensemble import RandomForestRegressor, ExtraTreesRegressor
from sklearn.model_selection import GridSearchCV, StratifiedKFold

from xgboost import XGBClassifier

import os
import random

In [157]:
## getting the data
data_path = "x__data"
df_train = pd.read_csv(f"{data_path}/train.csv")
df_test = pd.read_csv(f"{data_path}/test.csv")
sample_submission = pd.read_csv(f"{data_path}/samplesubmission.csv")

In [142]:
df_train.head()

Unnamed: 0,tweet_id,safe_text,label,agreement
0,CL1KWCMY,Me &amp; The Big Homie meanboy3000 #MEANBOY #M...,0.0,1.0
1,E3303EME,I'm 100% thinking of devoting my career to pro...,1.0,1.0
2,M4IVFSMS,"#whatcausesautism VACCINES, DO NOT VACCINATE Y...",-1.0,1.0
3,1DR6ROZ4,I mean if they immunize my kid with something ...,-1.0,1.0
4,J77ENIIE,Thanks to <user> Catch me performing at La Nui...,0.0,1.0


In [143]:
## checking for missing
df_train.isna().sum()

tweet_id     0
safe_text    0
label        1
agreement    2
dtype: int64

In [144]:
## checking the keyword column
df_train["agreement"].value_counts()

agreement
1.000000    5866
0.666667    3894
0.333333     239
Name: count, dtype: int64

In [145]:
## looking at the number of samples per class
df_train["label"].value_counts()

label
 0.000000    4908
 1.000000    4053
-1.000000    1038
 0.666667       1
Name: count, dtype: int64

#### Choosing to train only only columns that have above `60%` agreement

In [146]:
df_train = df_train[df_train["agreement"] > 0.6]

In [147]:
df_train["safe_text"].value_counts().shape

(9426,)

In [148]:
df_train.shape

(9760, 4)

##### Removing the duplicates

In [149]:
df_train = df_train.drop_duplicates(subset="safe_text")

In [150]:
## sanity check
df_train.shape

(9426, 4)

##### Looking at a random text sample

In [151]:
df_train.sample(n=5)["safe_text"].values

array(['The Indiana State Department of Health has issued some immunization changes for the 2014-15... <url> <user>',
       'Every cause needs a champion - Dr Lucas Otieno. We see this every day in our amazing health workers. #vaccineswork <user>',
       '#SuperStarSundays thepalace_nj #MMR @ The Palace <url>',
       'Measles continues to spread in Orange County <url>',
       '“<user> Georgia confirms its first #measles case since 2012 - an infant from outside the country <url> nopeee'],
      dtype=object)

In [152]:
## for cleaning function
## - remove urls
## - remove # tags
## - remove html special characters eg &amp;
## - remove @text
## - remove [01:04 UTC]
## - strip

In [153]:
import re

## function to preprocessing of text
def process_text(text):
    ## patterns to remove
    rem_pat_1 = "([@]|https?:)\S*"
    rem_pat_2 = "&\S+;"
    rem_pat_3 = "\[\d+:\d+.+\]" ## removing timestamp. eg. [01:04 UTC]
    rem_pat_4 = "[\-_.+#]" ## to remove symbols (make sure to bring last to avoid affecting first two patterns)
    rem_pat_5 = "<.*>"
    combined_rem_pat = f"({rem_pat_1})|({rem_pat_2})|({rem_pat_3})|({rem_pat_4})|({rem_pat_5})"

    text = re.sub(combined_rem_pat, "", text) ## removing text that match patterns
    text = text.strip() ## removing trailing white spaces
    text = text.lower() ## lowercasing

    return text

## function for tokenizing of string
def tokenize(text):
    return re.split("\s+", text)

## function to remove stop words
def remove_stopwords(token_list):
    l = []
    for word in token_list:
        if word not in stopwords:
            l.append(word)
    return l

def full_text_process(text):
    text = process_text(text)
    # text = tokenize(text)
    # text = remove_stopwords(text)
    return text

In [154]:
## creating datatransformers
class GloveVectorizer(BaseEstimator, TransformerMixin):
    def __init__(self, filepath=".", encoding="utf-8"):
        self.filepath = filepath
        self.encoding = encoding

    def fitX(self, X):
        self.unique_words = set()
        self.vectors = []
        self.word2idx = {}

        ## getting unique words
        for text in X:
            self.unique_words |= set(text.split())

        ## builing word to index dictionary and vector matrix
        idx = 0
        with open(self.filepath, encoding=self.encoding) as  file:
            for line in file:
                line = line.split()
                word = line[0]
                if word in self.unique_words:
                    self.word2idx[word] = idx
                    self.vectors.append(line[1:])
                    idx+=1

        self.vectors = np.array(self.vectors, dtype=np.float32)

    def fitY(self, y):
        ## creating integer labels for each category
        unique = {x for x in y}
        self.label2int = dict([(v,i) for i,v in enumerate(unique)])


    def transformX(self, X):
        N = len(X)
        transformed_x = np.zeros((N, self.vectors.shape[1]), dtype=np.float32)
        for i in range(N):
            mat = []
            line = X[i].lower().split()
            for word in line:
                if word in self.word2idx:
                    mat.append(self.vectors[self.word2idx[word]])

            if len(mat) > 0: transformed_x[i] = np.mean(mat, axis=0)
            else: print(f"Sentence at index:{i} has no word in the vector dictionary")

        return np.array(transformed_x)

    def transformY(self, y):
        return np.array([self.label2int[word] for word in y])
        

    def fit(self, X, y=None):
        self.fitX(X)

        if y is not None: self.fitY(y)

        return self
    

    def transform(self, X, y=None):
        if y is not None: return self.transformX(X), self.transformY(y)
        else: return self.transformX(X)

    def fit_transform(self, X, y=None):
        self.fit(X,y)
        return self.transform(X,y)


In [155]:
## getting the training and testing processed data
## train
train_text = df_train["safe_text"].apply(lambda x: full_text_process(x)).values
train_target = df_train["label"].values

In [92]:
random.choice(train_text)

'an nonimmunized child must be treated differently by docneed to recognize that deadly vaccine preventable diseases are possible vaccines'

In [93]:
random_state = 42
# X_train, X_test, y_train, y_test = train_test_split(train_text, train_target, test_size=0.3, shuffle=True, random_state=random_state)

In [94]:
glove_path = "../../../Machine_Learning_With_Python/NLP_LEARN/data/glove6B/glove.6B.50d.txt"
vectorizer = GloveVectorizer(glove_path, encoding="utf-8")

In [95]:
vectorizer.fit(train_text)

In [96]:
X = vectorizer.transform(train_text)
y = train_target

Sentence at index:17 has no word in the vector dictionary
Sentence at index:42 has no word in the vector dictionary
Sentence at index:71 has no word in the vector dictionary
Sentence at index:109 has no word in the vector dictionary
Sentence at index:112 has no word in the vector dictionary
Sentence at index:153 has no word in the vector dictionary
Sentence at index:175 has no word in the vector dictionary
Sentence at index:234 has no word in the vector dictionary
Sentence at index:252 has no word in the vector dictionary
Sentence at index:266 has no word in the vector dictionary
Sentence at index:366 has no word in the vector dictionary
Sentence at index:426 has no word in the vector dictionary
Sentence at index:441 has no word in the vector dictionary
Sentence at index:480 has no word in the vector dictionary
Sentence at index:483 has no word in the vector dictionary
Sentence at index:674 has no word in the vector dictionary
Sentence at index:722 has no word in the vector dictionary


### GridSearchCV

In [97]:
kf = StratifiedKFold(n_splits=5, shuffle=True, random_state=random_state)

#### RandomForestClassifier

In [104]:
params = {
    "n_estimators": [200, 300],
    "max_depth": [ 8, 10],
}

grid = GridSearchCV(RandomForestRegressor(), params, cv=kf, scoring="neg_root_mean_squared_error", verbose=10)
grid.fit(X,y)

Fitting 5 folds for each of 4 candidates, totalling 20 fits
[CV 1/5; 1/4] START max_depth=8, n_estimators=200...............................
[CV 1/5; 1/4] END max_depth=8, n_estimators=200;, score=-0.591 total time= 2.1min
[CV 2/5; 1/4] START max_depth=8, n_estimators=200...............................
[CV 2/5; 1/4] END max_depth=8, n_estimators=200;, score=-0.590 total time= 2.3min
[CV 3/5; 1/4] START max_depth=8, n_estimators=200...............................
[CV 3/5; 1/4] END max_depth=8, n_estimators=200;, score=-0.584 total time= 2.2min
[CV 4/5; 1/4] START max_depth=8, n_estimators=200...............................
[CV 4/5; 1/4] END max_depth=8, n_estimators=200;, score=-0.582 total time= 2.1min
[CV 5/5; 1/4] START max_depth=8, n_estimators=200...............................
[CV 5/5; 1/4] END max_depth=8, n_estimators=200;, score=-0.593 total time= 2.2min
[CV 1/5; 2/4] START max_depth=8, n_estimators=300...............................
[CV 1/5; 2/4] END max_depth=8, n_estimators=

In [105]:
print("Best Score:", grid.best_score_)
print("Best Parameters:", grid.best_params_)

Best Score: -0.5849319372968924
Best Parameters: {'max_depth': 10, 'n_estimators': 300}


### ExtraTreesClassifier

In [169]:
params = {
    "n_estimators": [600, 700],
    "max_depth": [16, 18],
}

grid_ex = GridSearchCV(ExtraTreesClassifier(), params, cv=kf, scoring="neg_root_mean_squared_error", verbose=10)
grid_ex.fit(X,y)

Fitting 5 folds for each of 4 candidates, totalling 20 fits
[CV 1/5; 1/4] START max_depth=16, n_estimators=600..............................
[CV 1/5; 1/4] END max_depth=16, n_estimators=600;, score=-0.674 total time=  20.7s
[CV 2/5; 1/4] START max_depth=16, n_estimators=600..............................
[CV 2/5; 1/4] END max_depth=16, n_estimators=600;, score=-0.699 total time=  19.0s
[CV 3/5; 1/4] START max_depth=16, n_estimators=600..............................
[CV 3/5; 1/4] END max_depth=16, n_estimators=600;, score=-0.673 total time=  18.8s
[CV 4/5; 1/4] START max_depth=16, n_estimators=600..............................
[CV 4/5; 1/4] END max_depth=16, n_estimators=600;, score=-0.674 total time=  18.5s
[CV 5/5; 1/4] START max_depth=16, n_estimators=600..............................
[CV 5/5; 1/4] END max_depth=16, n_estimators=600;, score=-0.693 total time=  17.8s
[CV 1/5; 2/4] START max_depth=16, n_estimators=700..............................
[CV 1/5; 2/4] END max_depth=16, n_estim

In [33]:
print("Best Score:", grid.best_score_)
print("Best Parameters:", grid.best_params_)

Best Score: 0.7998154448262861
Best Parameters: {'max_depth': 18, 'n_estimators': 600}


#### XGBClassifier

In [113]:
params = {
    "n_estimators": [200, 300],
    "max_depth": [6, 8, 10],
    "learning_rate": [0.05, 0.1]
}

grid = GridSearchCV(XGBClassifier(), params, cv=kf, scoring="neg_root_mean_squared_error", verbose=10)
grid.fit(X,y)

Fitting 5 folds for each of 12 candidates, totalling 60 fits
[CV 1/5; 1/12] START learning_rate=0.05, max_depth=6, n_estimators=200..........
[CV 1/5; 1/12] END learning_rate=0.05, max_depth=6, n_estimators=200;, score=0.805 total time=   3.1s
[CV 2/5; 1/12] START learning_rate=0.05, max_depth=6, n_estimators=200..........
[CV 2/5; 1/12] END learning_rate=0.05, max_depth=6, n_estimators=200;, score=0.809 total time=   3.0s
[CV 3/5; 1/12] START learning_rate=0.05, max_depth=6, n_estimators=200..........
[CV 3/5; 1/12] END learning_rate=0.05, max_depth=6, n_estimators=200;, score=0.798 total time=   2.6s
[CV 4/5; 1/12] START learning_rate=0.05, max_depth=6, n_estimators=200..........
[CV 4/5; 1/12] END learning_rate=0.05, max_depth=6, n_estimators=200;, score=0.792 total time=   3.3s
[CV 5/5; 1/12] START learning_rate=0.05, max_depth=6, n_estimators=200..........
[CV 5/5; 1/12] END learning_rate=0.05, max_depth=6, n_estimators=200;, score=0.787 total time=   2.8s
[CV 1/5; 2/12] START lea

In [114]:
print("Best Score:", grid.best_score_)
print("Best Parameters:", grid.best_params_)

Best Score: 0.7983689429621839
Best Parameters: {'learning_rate': 0.05, 'max_depth': 6, 'n_estimators': 200}


### Prediction

In [160]:
## filling nulls
df_test = df_test.fillna("")

In [125]:
ids = df_test["tweet_id"]

In [126]:
def predict_on_dataframe(model, data):
    '''Make a prediction on a dataframe'''
    pred_text = data["safe_text"].apply(lambda x: full_text_process(x)).values
    pred_text = vectorizer.transform(pred_text)
    return grid.predict(pred_text)

In [127]:
# pred_text = df_test["text"].apply(lambda x: full_text_process(x)).values
# pred_text = vectorizer.transform(pred_text)
preds = predict_on_dataframe(grid, df_test)

Sentence at index:11 has no word in the vector dictionary
Sentence at index:65 has no word in the vector dictionary
Sentence at index:74 has no word in the vector dictionary
Sentence at index:80 has no word in the vector dictionary
Sentence at index:93 has no word in the vector dictionary
Sentence at index:116 has no word in the vector dictionary
Sentence at index:204 has no word in the vector dictionary
Sentence at index:217 has no word in the vector dictionary
Sentence at index:225 has no word in the vector dictionary
Sentence at index:235 has no word in the vector dictionary
Sentence at index:241 has no word in the vector dictionary
Sentence at index:311 has no word in the vector dictionary
Sentence at index:377 has no word in the vector dictionary
Sentence at index:397 has no word in the vector dictionary
Sentence at index:550 has no word in the vector dictionary
Sentence at index:663 has no word in the vector dictionary
Sentence at index:693 has no word in the vector dictionary
Se

### Submitting

In [165]:
submission = pd.DataFrame({"tweet_id": ids, "target":np.clip(preds, -1, 1)})

if not os.path.exists("x__submissions"):
    os.mkdir("x__submissions")

save_name = "glove_rfr.csv"
submission.to_csv(f"x__submissions/{save_name}", index=False)

In [130]:
# ## creating submission dataframe
# ids = df_test["id"].values

# sub_df = pd.DataFrame({"id": ids, "target": preds.astype(int)})

# ## saving as csv
# if not os.path.exists("x__submissions"):
#     os.mkdir("x__submissions")

# sub_df.to_csv("x__submissions/sub_glove_1.csv", index=False)

### Saving Necessary Objects

In [29]:
import pickle

In [135]:
## creating transformer
class TweetTransformer():
    def __init__(self, word2idx, vectors):
        self.word2idx = word2idx
        self.vectors = vectors

    ## function to preprocessing of text
    def process_text(self, text):
        ## patterns to remove
        rem_pat_1 = "([@]|https?:)\S*"
        rem_pat_2 = "&\S+;"
        rem_pat_3 = "\[\d+:\d+.+\]" ## removing timestamp. eg. [01:04 UTC]
        rem_pat_4 = "[\-_.+#]" ## to remove symbols (make sure to bring last to avoid affecting first two patterns)
        combined_rem_pat = f"({rem_pat_1})|({rem_pat_2})|({rem_pat_3})|({rem_pat_4})"
    
        text = re.sub(combined_rem_pat, "", text) ## removing text that match patterns
        text = text.strip() ## removing trailing white spaces
        text = text.lower() ## lowercasing
    
        return text
    
    
    def full_text_process(self, text):
        text = self.process_text(text)
        return text


    def transform(self, X):
        
        if isinstance(X, str):
            X = [self.full_text_process(X)]
            
        N = len(X)
        X = [self.full_text_process(x) for x in X]
            
        transformed_x = np.zeros((N, self.vectors.shape[1]), dtype=np.float32)
        for i in range(N):
            mat = []
            line = X[i].lower().split()
            for word in line:
                if word in self.word2idx:
                    mat.append(self.vectors[self.word2idx[word]])

            if len(mat) > 0: transformed_x[i] = np.mean(mat, axis=0)
            else: print(f"Sentence at index:{i} has no word in the vector dictionary")

        return np.array(transformed_x)


In [27]:
estimator = ExtraTreesClassifier(**{'max_depth': 16, 'n_estimators': 500})
estimator.fit(X,y)

In [136]:
## creating transformer
transformer = TweetTransformer(vectorizer.word2idx, vectorizer.vectors)

In [30]:
save_folder = "x__serialized"
if not os.path.exists(save_folder):
    os.mkdir(save_folder)

## saving model
with open(os.path.join(save_folder, "disaster_tweets_classifier.pickle"), "wb") as file:
    pickle.dump(estimator, file)

# ## saving transformer
# with open(os.path.join(save_folder, "disaster_tweets_transformer.pickle"), "wb") as file:
#     pickle.dump(transformer, file)

## saving word2idx and vectors
with open(os.path.join(save_folder, "disaster_tweets_word_vectors.pickle"), "wb") as file:
    pickle.dump({"word2idx": vectorizer.word2idx, "vectors": vectorizer.vectors}, file)

In [122]:
estimator.predict(transformer.transform(train_text))[:30]

Sentence at index:24 has no word in the vector dictionary
Sentence at index:783 has no word in the vector dictionary
Sentence at index:2586 has no word in the vector dictionary
Sentence at index:3667 has no word in the vector dictionary
Sentence at index:3681 has no word in the vector dictionary
Sentence at index:3683 has no word in the vector dictionary
Sentence at index:4092 has no word in the vector dictionary
Sentence at index:4504 has no word in the vector dictionary
Sentence at index:5115 has no word in the vector dictionary
Sentence at index:5353 has no word in the vector dictionary
Sentence at index:5983 has no word in the vector dictionary
Sentence at index:5987 has no word in the vector dictionary
Sentence at index:5988 has no word in the vector dictionary
Sentence at index:5998 has no word in the vector dictionary
Sentence at index:6313 has no word in the vector dictionary
Sentence at index:6522 has no word in the vector dictionary
Sentence at index:6705 has no word in the v

array([1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0])