# Count Vectorizer and Ridge Classifier

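This is a copy of the "Getting Started" notebook: a `CountVectorizer` bag-of-words model feeding a `RidgeClassifier`, evaluated with 5-fold cross-validation stratified on the combined keyword and target.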

In [1]:
import os
from pathlib import Path

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.linear_model import RidgeClassifier
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import StratifiedKFold, cross_val_predict
from sklearn.metrics import classification_report, confusion_matrix

In [2]:
# Project root. Defaults to the original author's checkout; set the
# REAL_OR_NOT_DIR environment variable to run the notebook from another
# location without editing this cell.
BASE_DIR = Path(os.environ.get('REAL_OR_NOT_DIR', '/home/giovenko/Projects/real_or_not'))
# Input CSVs are read from here.
INPUT_DIR = BASE_DIR/'data/input'
# The submission CSV is written here.
OUTPUT_DIR = BASE_DIR/'data/output'

In [3]:
# Read the train and test CSVs from INPUT_DIR, in that order.
train_df, test_df = (
    pd.read_csv(INPUT_DIR / csv_name) for csv_name in ('train.csv', 'test.csv')
)

In [4]:
# Rows with no keyword get the placeholder 'missing' in both frames.
for frame in (train_df, test_df):
    frame['keyword'] = frame['keyword'].fillna('missing')

# Combine keyword and label into one string column (keyword followed by the
# target rendered as text); it serves as the stratification key for the CV folds.
train_df['str_target'] = train_df['target'].map(str)
train_df['keyword_target'] = train_df['keyword'].str.cat(train_df['str_target'])

In [5]:
# Pipeline: token counts (lower-cased, English stop words removed)
# followed by a ridge classifier with default settings.
count_vectorizer = CountVectorizer(lowercase=True, stop_words='english')
ridge_classifier = RidgeClassifier()
pipe = Pipeline(steps=[
    ('cvec', count_vectorizer),
    ('ridge', ridge_classifier),
])

In [6]:
# Cross-validation scheme: 5 shuffled folds stratified on the combined
# keyword+target label (stratifying on train_df.keyword alone is the
# simpler alternative).
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
# skf.split() returns a one-shot generator that is exhausted after its first
# use. Materialise the folds as a list so the same splits can be reused by
# several cells and any cell can be re-run without rebuilding `cv`.
cv = list(skf.split(train_df, train_df.keyword_target))

In [7]:
# Cross-validated F1 of the pipeline, one score per split in `cv`.
scores = cross_val_score(
    estimator=pipe,
    X=train_df.text,
    y=train_df.target,
    scoring='f1',
    cv=cv,
)

# Report the mean with a +/- two-standard-deviation band.
mean_f1 = scores.mean()
spread = 2 * scores.std()
print('score: {:.3f} +/- {:.3f}'.format(mean_f1, spread))



score: 0.735 +/- 0.026


In [8]:
# Out-of-fold predictions for every training row, using fresh splits from the
# same stratified scheme, then compared against the true labels.
cv = skf.split(train_df, train_df['keyword_target'])
y_pred = cross_val_predict(pipe, train_df['text'], train_df['target'], cv=cv)
confusion_matrix(y_true=train_df.target, y_pred=y_pred)



array([[3749,  593],
       [1023, 2248]])

In [9]:
# Fit the pipeline on the full training set before predicting on the test set.
pipe.fit(X=train_df['text'], y=train_df['target'])

Pipeline(steps=[('cvec', CountVectorizer(stop_words='english')),
                ('ridge', RidgeClassifier())])

In [10]:
# We reuse the sample submission, replacing its target column with our own predictions.
# NOTE(review): assumes sample_submission.csv rows are in the same order as test.csv - confirm.
sample_submission = pd.read_csv(INPUT_DIR/'sample_submission.csv')
sample_submission['target'] = pipe.predict(test_df.text)
# Create the output directory if it is missing so to_csv does not fail on a fresh checkout.
OUTPUT_DIR.mkdir(parents=True, exist_ok=True)
sample_submission.to_csv(OUTPUT_DIR/'submission_countvec_keyword_target.csv', index=False)