## Exploratory Data Analysis

In [1]:
# set paths
import os
os.chdir('../../')
print(os.getcwd())

/Users/chengyu/OneDrive/学习/Graduate Study/2021Winter/twitter-nlp


In [78]:
# import libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import re

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import LogisticRegression

In [2]:
df_train = pd.read_csv('./data/train.csv')
df_train.head()

Unnamed: 0,id,keyword,location,text,target
0,1,,,Our Deeds are the Reason of this #earthquake M...,1
1,4,,,Forest fire near La Ronge Sask. Canada,1
2,5,,,All residents asked to 'shelter in place' are ...,1
3,6,,,"13,000 people receive #wildfires evacuation or...",1
4,7,,,Just got sent this photo from Ruby #Alaska as ...,1


In [67]:
df_test = pd.read_csv('./data/test.csv')
df_test.head()

Unnamed: 0,id,keyword,location,text
0,0,,,Just happened a terrible car crash
1,2,,,"Heard about #earthquake is different cities, s..."
2,3,,,"there is a forest fire at spot pond, geese are..."
3,9,,,Apocalypse lighting. #Spokane #wildfires
4,11,,,Typhoon Soudelor kills 28 in China and Taiwan


## Data Preprocessing

In [3]:
df_train['text']

0       Our Deeds are the Reason of this #earthquake M...
1                  Forest fire near La Ronge Sask. Canada
2       All residents asked to 'shelter in place' are ...
3       13,000 people receive #wildfires evacuation or...
4       Just got sent this photo from Ruby #Alaska as ...
                              ...                        
7608    Two giant cranes holding a bridge collapse int...
7609    @aria_ahrary @TheTawniest The out of control w...
7610    M1.94 [01:04 UTC]?5km S of Volcano Hawaii. htt...
7611    Police investigating after an e-bike collided ...
7612    The Latest: More Homes Razed by Northern Calif...
Name: text, Length: 7613, dtype: object

In [65]:
punctuation_pattern = r"[.!?#@÷>\+\-\*/':;(),\|\[\]_]+|[\x89Û|\x89Ó|\x89Ò|\x89|åê]+|http://t.co/[A-Za-z0-9]+|https://t.co/[A-Za-z0-9]+|\&gt|\&amp" 
df_train['text_clean'] = [(re.sub(punctuation_pattern, ' ', doc)).lower().strip() for doc in df_train['text']]

new_line = r'\n'
df_train['text_clean'] = [re.sub(new_line, ' ', doc) for doc in df_train['text_clean']]

multiple_spaces = r'[ ]+'
df_train['text_clean'] = [re.sub(multiple_spaces, ' ', doc) for doc in df_train['text_clean']]
df_train['text_clean']

0       our deeds are the reason of this earthquake ma...
1                   forest fire near la ronge sask canada
2       all residents asked to shelter in place are be...
3       13 000 people receive wildfires evacuation ord...
4       just got sent this photo from ruby alaska as s...
                              ...                        
7608    two giant cranes holding a bridge collapse int...
7609    aria ahrary thetawniest the out of control wil...
7610              m1 94 01 04 utc 5km s of volcano hawaii
7611    police investigating after an e bike collided ...
7612    the latest more homes razed by northern califo...
Name: text_clean, Length: 7613, dtype: object

## Baseline Model

In [75]:
tfidf = TfidfVectorizer(max_df=0.995, min_df=0.005, stop_words='english')

X_train = tfidf.fit_transform(df_train['text_clean'])
y_train = df_train['target']

In [76]:
X_train.toarray().shape

(7613, 234)

In [98]:
lr = LogisticRegression()

param_grid = {'penalty': ['l1', 'l2'],
              'C': np.arange(0.005, 1, 0.005),
              'solver': ['lbfgs', 'saga'],
              'fit_intercept': [True],
              'max_iter': [100, 200, 300, 400, 500]}

grid_search = GridSearchCV(lr, param_grid=param_grid, n_jobs=-1, cv=5)

grid_search.fit(X_train, y_train)

4975 fits failed out of a total of 19900.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
4975 fits failed with the following error:
Traceback (most recent call last):
  File "/Users/chengyu/miniforge3/lib/python3.9/site-packages/sklearn/model_selection/_validation.py", line 681, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/Users/chengyu/miniforge3/lib/python3.9/site-packages/sklearn/linear_model/_logistic.py", line 1461, in fit
    solver = _check_solver(self.solver, self.penalty, self.dual)
  File "/Users/chengyu/miniforge3/lib/python3.9/site-packages/sklearn/linear_model/_logistic.py", line 447, in _check_solver
    raise ValueError(
ValueError: Solver lbfgs supports only 'l2' or 'none' penalties, got l

GridSearchCV(cv=5, estimator=LogisticRegression(), n_jobs=-1,
             param_grid={'C': array([0.005, 0.01 , 0.015, 0.02 , 0.025, 0.03 , 0.035, 0.04 , 0.045,
       0.05 , 0.055, 0.06 , 0.065, 0.07 , 0.075, 0.08 , 0.085, 0.09 ,
       0.095, 0.1  , 0.105, 0.11 , 0.115, 0.12 , 0.125, 0.13 , 0.135,
       0.14 , 0.145, 0.15 , 0.155, 0.16 , 0.165, 0.17 , 0.175, 0.18 ,
       0.185, 0.19 , 0.195, 0.2  , 0.205, 0.21 , 0.215, 0.22 , 0.225,
       0.23 , 0.235, 0.24 , 0.245,...
       0.77 , 0.775, 0.78 , 0.785, 0.79 , 0.795, 0.8  , 0.805, 0.81 ,
       0.815, 0.82 , 0.825, 0.83 , 0.835, 0.84 , 0.845, 0.85 , 0.855,
       0.86 , 0.865, 0.87 , 0.875, 0.88 , 0.885, 0.89 , 0.895, 0.9  ,
       0.905, 0.91 , 0.915, 0.92 , 0.925, 0.93 , 0.935, 0.94 , 0.945,
       0.95 , 0.955, 0.96 , 0.965, 0.97 , 0.975, 0.98 , 0.985, 0.99 ,
       0.995]),
                         'fit_intercept': [True],
                         'max_iter': [100, 200, 300, 400, 500],
                         'penalty': ['l1

In [99]:
display(grid_search.best_params_)
display(grid_search.best_score_)

{'C': 0.34,
 'fit_intercept': True,
 'max_iter': 200,
 'penalty': 'l2',
 'solver': 'saga'}

0.6727997252811252