In [7]:
import pandas as pd
from sklearn import linear_model
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import RidgeClassifier
import string
from joblib import dump, load

In [2]:
# Read the raw dataset; the path is relative to the notebook's working directory.
data_path = '../data/data.csv'
df = pd.read_csv(data_path)

In [3]:
punc = string.punctuation
# Keep the untouched names for the baseline model. Guarded so that re-running
# this cell does not overwrite the raw column with already-cleaned text.
if 'Original Name' not in df.columns:
    df['Original Name'] = df['Name']
# Always derive the cleaned column from the raw one so the cell gives the same
# result no matter how many times it is executed: cast to str, lowercase, then
# delete every character listed in string.punctuation.
df['Name'] = (
    df['Original Name']
    .astype(str)
    .str.lower()
    .str.translate(str.maketrans('', '', punc))
)

In [4]:
# Hold out 20% of the rows for evaluation. The seed is fixed so every model
# cell below is compared on the same split and a Restart & Run All reproduces it.
train, test = train_test_split(df, test_size=0.2, random_state=42)

# Baseline Model
- The idea is to do as little as possible, to get a starting point from which to work

In [5]:
%%time
vectorizer = CountVectorizer()
X = vectorizer.fit_transform(np.array(train['Original Name']))
y = train['Class']
X_test = vectorizer.transform(np.array(test['Original Name']))
y_test = test['Class']
clf = RidgeClassifier().fit(X, y)
# Train accuracy
print('Train acc: {}'.format(clf.score(X, y)))
# Test accuracy
print('Test acc: {}'.format(clf.score(X_test, y_test)))

Train acc: 0.9192991806919361
Test acc: 0.7120665464020846
CPU times: user 9min 50s, sys: 28min 21s, total: 38min 12s
Wall time: 2min 28s


In [20]:
# Persist the fitted classifier together with the vectorizer it was trained with.
# NOTE(review): this saves whatever `clf` / `vectorizer` are bound to when the
# cell runs. Later cells rebind both names, so confirm the baseline cell was the
# most recent model cell executed before saving under the "base" filename.
model_path = '../models/base_model.joblib'
vectorizer_path = "../models/vectorizer.joblib"
dump(clf, model_path)
dump(vectorizer, vectorizer_path)

['../models/vectorizer.joblib']

# Do the text changes help?

In [7]:
%%time
vectorizer = CountVectorizer(strip_accents='unicode')
X = vectorizer.fit_transform(np.array(train['Name']))
y = train['Class']
X_test = vectorizer.transform(np.array(test['Name']))
y_test = test['Class']
clf = RidgeClassifier().fit(X, y)
# Train accuracy
print('Train acc: {}'.format(clf.score(X, y)))
# Test accuracy
print('Test acc: {}'.format(clf.score(X_test, y_test)))

Train acc: 0.9229504201316997
Test acc: 0.7070919659614789
CPU times: user 6min 8s, sys: 17min 51s, total: 24min
Wall time: 1min 34s


# TF-IDF
- Does this make much of an impact on short texts?

In [10]:
%%time
vectorizer = TfidfVectorizer()
X = vectorizer.fit_transform(np.array(train['Name']))
y = train['Class']
X_test = vectorizer.transform(np.array(test['Name']))
y_test = test['Class']
clf = RidgeClassifier().fit(X, y)
# Train accuracy
print('Train acc: {}'.format(clf.score(X, y)))
# Test accuracy
print('Test acc: {}'.format(clf.score(X_test, y_test)))

Train acc: 0.9220712071412139
Test acc: 0.705433772481277
CPU times: user 3min, sys: 8min 33s, total: 11min 33s
Wall time: 49.4 s


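# Use df freq to ignore more common words?
- Note: the cell below uses `max_features=10000`, which keeps only the 10,000 most frequent terms (i.e. it drops rare words); ignoring the most common words would need `max_df` instead.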

In [16]:
%%time
vectorizer = TfidfVectorizer(max_features=10000)
X = vectorizer.fit_transform(np.array(train['Name']))
y = train['Class']
X_test = vectorizer.transform(np.array(test['Name']))
y_test = test['Class']
clf = RidgeClassifier().fit(X, y)
# Train accuracy
print('Train acc: {}'.format(clf.score(X, y)))
# Test accuracy
print('Test acc: {}'.format(clf.score(X_test, y_test)))

Train acc: 0.6503807265579267
Test acc: 0.6314892764080978
CPU times: user 12.2 s, sys: 318 ms, total: 12.5 s
Wall time: 12.3 s
