# TF-IDF + ridge classifier, cross-validated with stratification on keyword and target (cleaned data)

In [1]:
import pandas as pd
from pathlib import Path
from sklearn.pipeline import Pipeline
from sklearn.linear_model import RidgeClassifier
from sklearn.model_selection import cross_val_score, StratifiedKFold
from sklearn.model_selection import cross_validate
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import confusion_matrix
from sklearn.preprocessing import MinMaxScaler, StandardScaler
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import CountVectorizer
import utils

In [2]:
# Project directories, all derived from the current user's home directory
# so the notebook does not depend on one machine's absolute path.
HOME_DIR = Path.home()
BASE_DIR = HOME_DIR.joinpath('Projects/real_or_not')
# Raw competition files are read from INPUT_DIR; generated files go to OUTPUT_DIR.
INPUT_DIR = BASE_DIR.joinpath('data/input')
OUTPUT_DIR = BASE_DIR.joinpath('data/output')

In [3]:
# Load the raw train and test CSV files from the input directory.
train_df, test_df = (
    pd.read_csv(INPUT_DIR / csv_name) for csv_name in ('train.csv', 'test.csv')
)

In [4]:
# Preview the first five rows of the raw training data.
train_df.head(n=5)

Unnamed: 0,id,keyword,location,text,target
0,1,,,Our Deeds are the Reason of this #earthquake M...,1
1,4,,,Forest fire near La Ronge Sask. Canada,1
2,5,,,All residents asked to 'shelter in place' are ...,1
3,6,,,"13,000 people receive #wildfires evacuation or...",1
4,7,,,Just got sent this photo from Ruby #Alaska as ...,1


In [5]:
# Run the project's cleaning helper on both frames (train first, then test).
# NOTE(review): utils.clean_df is defined outside this notebook; judging by the
# preview of clean_train_df below, it presumably fills missing keywords and adds
# the str_target / keyword_target / clean_text columns -- confirm in utils.
clean_train_df, clean_test_df = map(utils.clean_df, (train_df, test_df))

In [6]:
# Preview the first five rows of the cleaned training data.
clean_train_df.head(n=5)

Unnamed: 0,id,keyword,location,text,target,str_target,keyword_target,clean_text
0,1,missing,,Our Deeds are the Reason of this #earthquake M...,1,1,missing1,our deeds are the reason of this earthquake ma...
1,4,missing,,Forest fire near La Ronge Sask. Canada,1,1,missing1,forest fire near la ronge sask canada
2,5,missing,,All residents asked to 'shelter in place' are ...,1,1,missing1,all residents asked to shelter in place are be...
3,6,missing,,"13,000 people receive #wildfires evacuation or...",1,1,missing1,13 000 people receive wildfires evacuation ord...
4,7,missing,,Just got sent this photo from Ruby #Alaska as ...,1,1,missing1,just got sent this photo from ruby alaska as s...


In [7]:
# Two-step model: tf-idf features (English stop words removed) feeding a
# ridge classifier with regularization strength alpha=1.
tfidf_step = ('tfidf', TfidfVectorizer(stop_words='english'))
ridge_step = ('ridge', RidgeClassifier(alpha=1))
pipe = Pipeline(steps=[tfidf_step, ridge_step])

The test scores from `cross_val_score` and from `cross_validate` should be identical, because both evaluate the same pipeline on the same unshuffled stratified folds.

In [8]:
# Cross-validation scheme stratified on the combined keyword/target label
# (the keyword_target column), not on the keyword alone.
# Shuffling is left off (shuffle=True, random_state=42 would enable it), so
# the folds are deterministic and follow the row order of the data.
skf = StratifiedKFold(n_splits=5)
# skf.split() returns a one-shot generator. Materialize the (train, test)
# index pairs as a list so `cv` can be handed to more than one scorer and
# the scoring cells can be re-run without receiving an exhausted iterator.
cv = list(skf.split(clean_train_df, clean_train_df.keyword_target))

In [9]:
# F1 score on each held-out fold, using the splits stored in `cv`.
features = clean_train_df['clean_text']
labels = clean_train_df['target']
scores1 = cross_val_score(estimator=pipe, X=features, y=labels,
                          cv=cv, scoring='f1')
print(scores1)



[0.74252492 0.76144578 0.73449921 0.76213592 0.73786408]


In [10]:
# Build a fresh set of splits for this run, then collect timings plus
# train and test F1 for every fold.
cv = skf.split(clean_train_df, clean_train_df['keyword_target'])
scores2 = cross_validate(
    pipe,
    X=clean_train_df['clean_text'],
    y=clean_train_df['target'],
    cv=cv,
    scoring='f1',
    return_train_score=True,
)
# One row per fold: fit_time, score_time, test_score, train_score.
pd.DataFrame(scores2)



Unnamed: 0,fit_time,score_time,test_score,train_score
0,0.177387,0.036448,0.742525,0.952362
1,0.126214,0.035056,0.761446,0.945254
2,0.129942,0.037005,0.734499,0.948425
3,0.12192,0.035452,0.762136,0.948773
4,0.119603,0.034814,0.737864,0.952548


The model is overfitting heavily: the train F1 is about 0.95 on every fold, while the test F1 stays between 0.73 and 0.76. The next step is to control overfitting.

In [11]:
# Refit the pipeline on the full cleaned training set, then predict labels
# for the cleaned test tweets.
train_text, train_labels = clean_train_df['clean_text'], clean_train_df['target']
y_pred = pipe.fit(train_text, train_labels).predict(clean_test_df['clean_text'])

In [12]:
# Submission export, intentionally disabled. Uncommenting the three lines
# below would read sample_submission.csv from INPUT_DIR, overwrite its
# 'target' column with y_pred, and write the result to OUTPUT_DIR.
# sample_submission = pd.read_csv(INPUT_DIR/'sample_submission.csv')
# sample_submission['target'] = y_pred
# sample_submission.to_csv(OUTPUT_DIR/'submission_tfidf_keyword_target.csv', index=False)