## Exploratory Data Analysis

In [1]:
# set paths
import os
os.chdir('../../')
print(os.getcwd())

import sys
sys.path.append('.')
from src.utils.submission import prediction_output
from src.utils.preprocessing import TextNormalizer

e:\OneDriveLocal\OneDrive\学习\Graduate Study\2021Winter\twitter-nlp


In [2]:
# import libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import re

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import LogisticRegression

In [3]:
df_train = pd.read_csv('./data/train.csv')
df_train.head()

Unnamed: 0,id,keyword,location,text,target
0,1,,,Our Deeds are the Reason of this #earthquake M...,1
1,4,,,Forest fire near La Ronge Sask. Canada,1
2,5,,,All residents asked to 'shelter in place' are ...,1
3,6,,,"13,000 people receive #wildfires evacuation or...",1
4,7,,,Just got sent this photo from Ruby #Alaska as ...,1


In [4]:
df_test = pd.read_csv('./data/test.csv')
df_test.head()

Unnamed: 0,id,keyword,location,text
0,0,,,Just happened a terrible car crash
1,2,,,"Heard about #earthquake is different cities, s..."
2,3,,,"there is a forest fire at spot pond, geese are..."
3,9,,,Apocalypse lighting. #Spokane #wildfires
4,11,,,Typhoon Soudelor kills 28 in China and Taiwan


## Data Preprocessing

In [5]:
df_train['text']

0       Our Deeds are the Reason of this #earthquake M...
1                  Forest fire near La Ronge Sask. Canada
2       All residents asked to 'shelter in place' are ...
3       13,000 people receive #wildfires evacuation or...
4       Just got sent this photo from Ruby #Alaska as ...
                              ...                        
7608    Two giant cranes holding a bridge collapse int...
7609    @aria_ahrary @TheTawniest The out of control w...
7610    M1.94 [01:04 UTC]?5km S of Volcano Hawaii. htt...
7611    Police investigating after an e-bike collided ...
7612    The Latest: More Homes Razed by Northern Calif...
Name: text, Length: 7613, dtype: object

In [6]:
rules = {r"[.!?#@÷>\+\-\*/':;(),\|\[\]_]+|[\x89Û|\x89Ó|\x89Ò|\x89|åê]+|http://t.co/[A-Za-z0-9]+|https://t.co/[A-Za-z0-9]+|\&gt|\&amp": ' ',
         r'\n': ' ',
         r'[ ]+': ' '}

text_normalizer = TextNormalizer(rules=rules)

df_train['text_clean'] = text_normalizer.clean(df_train['text'])
df_test['text_clean'] = text_normalizer.clean(df_test['text'])

y_train = df_train['target']

## Baseline Model

In [7]:
tfidf = TfidfVectorizer(max_df=0.995, min_df=0.005, stop_words='english')

X_train = tfidf.fit_transform(df_train['text_clean'])
y_train = df_train['target']

In [8]:
X_train.toarray().shape

(7613, 234)

In [9]:
lr = LogisticRegression()

param_grid = {'penalty': ['l1', 'l2'],
              'C': np.arange(0.005, 1, 0.005),
              'solver': ['lbfgs', 'saga'],
              'fit_intercept': [True],
              'max_iter': [100, 200, 300, 400, 500]}

grid_search = GridSearchCV(lr, param_grid=param_grid, n_jobs=-1, cv=5)

grid_search.fit(X_train, y_train)



GridSearchCV(cv=5, estimator=LogisticRegression(), n_jobs=-1,
             param_grid={'C': array([0.005, 0.01 , 0.015, 0.02 , 0.025, 0.03 , 0.035, 0.04 , 0.045,
       0.05 , 0.055, 0.06 , 0.065, 0.07 , 0.075, 0.08 , 0.085, 0.09 ,
       0.095, 0.1  , 0.105, 0.11 , 0.115, 0.12 , 0.125, 0.13 , 0.135,
       0.14 , 0.145, 0.15 , 0.155, 0.16 , 0.165, 0.17 , 0.175, 0.18 ,
       0.185, 0.19 , 0.195, 0.2  , 0.205, 0.21 , 0.215, 0.22 , 0.225,
       0.23 , 0.235, 0.24 , 0.245,...
       0.77 , 0.775, 0.78 , 0.785, 0.79 , 0.795, 0.8  , 0.805, 0.81 ,
       0.815, 0.82 , 0.825, 0.83 , 0.835, 0.84 , 0.845, 0.85 , 0.855,
       0.86 , 0.865, 0.87 , 0.875, 0.88 , 0.885, 0.89 , 0.895, 0.9  ,
       0.905, 0.91 , 0.915, 0.92 , 0.925, 0.93 , 0.935, 0.94 , 0.945,
       0.95 , 0.955, 0.96 , 0.965, 0.97 , 0.975, 0.98 , 0.985, 0.99 ,
       0.995]),
                         'fit_intercept': [True],
                         'max_iter': [100, 200, 300, 400, 500],
                         'penalty': ['l1

In [10]:
print(grid_search.best_params_)
print(f"accuracy: {grid_search.best_score_}")

{'C': 0.34, 'fit_intercept': True, 'max_iter': 400, 'penalty': 'l2', 'solver': 'saga'}
accuracy: 0.6727997252811252


## Word Embedding

In [9]:
import spacy
from scipy import sparse

def vectorize_document(document):
    """sum up the word vectors in a document
    Args:
        document (spacy.tokens.doc.Doc): a parsed document
    Returns:
        vector (numpy.array): an array of shape (300,)
    """
    vector = np.zeros(300)
    for token in document:
        vector += token.vector

    return vector

In [10]:
nlp = spacy.load('en_core_web_lg')
df_train['text_preprocessed'] = [nlp(doc) for doc in df_train['text_clean']]
df_test['text_preprocessed'] = [nlp(doc) for doc in df_test['text_clean']]

In [11]:
X_train_vec = np.array([vectorize_document(doc) for doc in df_train['text_preprocessed']])
X_test_vec = np.array([vectorize_document(doc) for doc in df_test['text_preprocessed']])

In [16]:
X_train_vec.shape

(7613, 300)

In [60]:
X_test_vec.shape

(3263, 300)

In [31]:
lr = LogisticRegression()

param_grid = {'penalty': ['l1', 'l2'],
              'C': [0.0081, 0.0082, 0.0083, 0.0084, 0.0085, 0.0086, 0.0087, 0.0088, 0.0089, 0.009, 0.0091, 0.0092, 0.0093],
              'solver': ['lbfgs', 'saga'],
              'fit_intercept': [True],
              'max_iter': [500]}

grid_search = GridSearchCV(lr, param_grid=param_grid, n_jobs=-1, cv=5)

grid_search.fit(X_train_vec, y_train)

 0.78431583 0.78444715        nan 0.79561546 0.78418442 0.78431583
        nan 0.79587818 0.78418434 0.78418442        nan 0.7958781
 0.78418434 0.7840531         nan 0.79627205 0.78405302 0.7840531
        nan 0.79627214 0.78405302 0.7840531         nan 0.7966661
 0.78405302 0.78392178        nan 0.79732304 0.78405293 0.78379046
        nan 0.79797982 0.78365897 0.78365914        nan 0.79837386
 0.7839217  0.78365914        nan 0.79811122 0.7839217  0.78352774
        nan 0.79784858 0.7839217  0.78352774]


GridSearchCV(cv=5, estimator=LogisticRegression(), n_jobs=-1,
             param_grid={'C': [0.0081, 0.0082, 0.0083, 0.0084, 0.0085, 0.0086,
                               0.0087, 0.0088, 0.0089, 0.009, 0.0091, 0.0092,
                               0.0093],
                         'fit_intercept': [True], 'max_iter': [500],
                         'penalty': ['l1', 'l2'], 'solver': ['lbfgs', 'saga']})

In [33]:
grid_search.best_score_

0.7983738609822407

In [35]:
grid_search.best_params_

{'C': 0.0091,
 'fit_intercept': True,
 'max_iter': 500,
 'penalty': 'l1',
 'solver': 'saga'}

In [12]:
lr_best = LogisticRegression(C=0.0091, fit_intercept=True, max_iter=500, penalty='l1', solver='saga')
lr_best.fit(X_train_vec, y_train)
y_test_pred = lr_best.predict(X_test_vec)

In [13]:
prediction_output(y_test_pred, './submissions/submission.csv', id=df_test['id'])

In [51]:
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier, GradientBoostingClassifier

In [40]:
rf = RandomForestClassifier()

rf_param_grid = {'n_estimators': [650, 660, 670, 680, 690, 700, 710, 720, 730, 740],
                 'max_depth': [5, 6, 7, 8, 9, 10]}

rf_grid_search = GridSearchCV(rf, param_grid=rf_param_grid, n_jobs=-1, cv=5)
rf_grid_search.fit(X_train_vec, y_train)

GridSearchCV(cv=5, estimator=RandomForestClassifier(), n_jobs=-1,
             param_grid={'max_depth': [5, 6, 7, 8, 9, 10],
                         'n_estimators': [650, 660, 670, 680, 690, 700, 710,
                                          720, 730, 740]})

In [41]:
rf_grid_search.best_score_

0.7874719047319119

In [42]:
rf_grid_search.best_params_

{'max_depth': 10, 'n_estimators': 680}

In [46]:
adaboost = AdaBoostClassifier()

adaboost_param_grid = {'n_estimators': [1000, 2000, 3000, 4000],
                       'learning_rate': [0.1]}

adaboost_grid_search = GridSearchCV(adaboost, adaboost_param_grid, n_jobs=-1, cv=5)

adaboost_grid_search.fit(X_train_vec, y_train)

GridSearchCV(cv=5, estimator=AdaBoostClassifier(), n_jobs=-1,
             param_grid={'learning_rate': [0.1],
                         'n_estimators': [1000, 2000, 3000, 4000]})

In [48]:
adaboost_grid_search.best_score_

0.794038151756294

In [50]:
adaboost_grid_search.best_params_

{'learning_rate': 0.1, 'n_estimators': 1000}

In [53]:
gbt = GradientBoostingClassifier()

gbt_param_grid = {'learning_rate': [0.1, 0.2, 0.3, 0.4, 0.5],
                  'n_estimators': [100, 200, 300, 400, 500]}

gbt_grid_search = GridSearchCV(gbt, gbt_param_grid, n_jobs=-1, cv=5)

gbt_grid_search.fit(X_train_vec, y_train)

GridSearchCV(cv=5, estimator=GradientBoostingClassifier(), n_jobs=-1,
             param_grid={'learning_rate': [0.1, 0.2, 0.3, 0.4, 0.5],
                         'n_estimators': [100, 200, 300, 400, 500]})

In [54]:
gbt_grid_search.best_score_

0.791805198088357

In [55]:
gbt_grid_search.best_params_

{'learning_rate': 0.1, 'n_estimators': 100}