In [28]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.datasets import load_files
from sklearn.feature_selection import SelectKBest, chi2, f_classif
import pandas as pd
import numpy as np

In [54]:
# Load the corpus from the 'data' directory with scikit-learn's load_files.
# "ansi" is a Windows-only codec alias (for "mbcs"), so the original call raised
# LookupError on Linux/macOS; "cp1252" names the code page explicitly.
# NOTE(review): assumes the files use the Western-European ANSI code page (1252)
# -- confirm if the corpus was produced under a different Windows locale.
texts = load_files('data', encoding="cp1252")
# Nominal number of characters per text sample passed to split_text.
sample_length = 2000

def split_text(text, sample_length):
    """Cut ``text`` into consecutive samples of roughly ``sample_length`` characters.

    Each sample starts as a slice of ``sample_length + 1`` characters.  When the
    next cut position falls on an alphabetic character, the previous sample is
    extended up to and including the first non-alphabetic character that
    follows, and the next sample starts at that character, so a word is never
    split between two samples.

    The first sample of the text is always dropped.
    NOTE(review): presumably this discards front matter at the start of each
    document -- confirm that this is intended.

    Args:
        text: The string to split.
        sample_length: Nominal sample size in characters; must be positive.

    Returns:
        A list of sample strings (empty when ``text`` produces at most one
        sample, including when ``text`` is empty).

    Raises:
        ValueError: If ``sample_length`` is not positive, because the scan
            position could never advance.
    """
    if sample_length <= 0:
        raise ValueError("sample_length must be a positive integer")

    samples = []
    text_length = len(text)
    i = 0
    while i < text_length:
        if i > 0:
            # Finish the word straddling the cut: find the first non-alphabetic
            # character at or after i, bounded by the end of the text (the
            # original indexed one past the end here and raised IndexError).
            word_end = i
            while word_end < text_length and text[word_end].isalpha():
                word_end += 1
            # text[i] is already the last character of the previous slice.
            samples[-1] += text[i + 1:word_end + 1]
            i = word_end
            if i >= text_length:
                # The text ended inside the word; nothing is left to sample.
                break
        samples.append(text[i:i + sample_length + 1])
        i += sample_length
    # Drop the first sample; slice deletion is a no-op on an empty list.
    del samples[:1]
    return samples
# Expand every document into its samples; each sample inherits the class label
# of the document it was cut from.
data = []
labels = []
for document, target in zip(texts.data, texts.target):
    document_samples = split_text(document, sample_length)
    data.extend(document_samples)
    labels.extend([target] * len(document_samples))
# Replace the per-document corpus with the per-sample one.
texts.data = data
texts.target = labels

In [69]:
# TF-IDF features over unigrams and bigrams, lower-cased, with English stop
# words removed and document-frequency bounds of min_df=0.001 / max_df=0.1.
count_vect = TfidfVectorizer(
    ngram_range=(1, 2),
    stop_words='english',
    lowercase=True,
    min_df=0.001,
    max_df=0.1,
)
vector = count_vect.fit_transform(texts.data)

In [70]:
# Dense DataFrame with one row per sample and one column per extracted n-gram.
# get_feature_names() was removed from recent scikit-learn releases in favour of
# get_feature_names_out(); fall back to the old name where the new one is absent.
if hasattr(count_vect, "get_feature_names_out"):
    feature_names = count_vect.get_feature_names_out()
else:
    feature_names = count_vect.get_feature_names()
# toarray() yields a plain ndarray rather than the legacy np.matrix of todense().
df = pd.DataFrame(vector.toarray(), columns=feature_names)
df.shape

(11028, 18048)

In [91]:
# Random split of the samples: each row goes to the training set with
# probability 0.75, otherwise to the test set.
# A seeded generator makes the split, and every score computed from it,
# reproducible across kernel restarts (the original mask was unseeded).
split_rng = np.random.RandomState(42)
msk = split_rng.rand(len(df)) < 0.75
label_series = pd.Series(labels)
train_x = df[msk]
train_y = label_series[msk]
test_x = df[~msk]
test_y = label_series[~msk]
# Features and labels must stay aligned in both partitions.
assert len(train_x) == len(train_y)
assert len(test_x) == len(test_y)

In [72]:
# Fit PCA on the training rows only, keeping enough components to retain
# 90% of the variance.
from sklearn.decomposition import PCA
pca = PCA(n_components=0.9)
pca.fit(train_x);

In [94]:
# Project both splits onto the principal components fitted on the training data.
train_x_pca, test_x_pca = (pca.transform(split) for split in (train_x, test_x))
train_x_pca.shape

(8311, 4336)

In [75]:
# Score every PCA component against the labels with f_classif and keep the
# 1000 best-scoring ones.
selector = SelectKBest(f_classif, k=1000)
# Only the fitted selector is needed here; the original called fit_transform()
# and discarded the transformed matrix it returned.
selector.fit(train_x_pca, train_y)
# Column indices (into the PCA-projected matrices) of the selected components.
selected_feature_ids = selector.get_support(indices=True)

In [96]:
# Number of selected component indices.
selected_feature_ids.size

1000

In [99]:
# Restrict both PCA-projected splits to the selected component columns.
train_x_reduced = np.take(train_x_pca, selected_feature_ids, axis=1)
test_x_reduced = np.take(test_x_pca, selected_feature_ids, axis=1)
# Row counts must still line up with the label vectors.
assert train_x_reduced.shape[0] == len(train_y)
assert test_x_reduced.shape[0] == len(test_y)

In [None]:
# Gaussian naive Bayes classifier; the last line reports test accuracy.
# NOTE(review): this model is trained on all PCA components (train_x_pca),
# whereas the other models use train_x_reduced -- confirm that is intended.
from sklearn.naive_bayes import GaussianNB
nb = GaussianNB().fit(train_x_pca, train_y)
nb.score(test_x_pca, test_y)

In [117]:
# Multinomial logistic regression (lbfgs solver) on the selected components;
# the last line reports test accuracy.
from sklearn.linear_model import LogisticRegression
lr = LogisticRegression(multi_class='multinomial', solver='lbfgs').fit(train_x_reduced, train_y)
lr.score(test_x_reduced, test_y)

0.9227088700772911

In [124]:
# Support vector classifier with the library's default hyper-parameters;
# the last line reports test accuracy.
# NOTE(review): the recorded score is far below the other models -- presumably
# the default kernel settings do not suit these features; verify before relying on it.
from sklearn import svm
svc = svm.SVC().fit(train_x_reduced, train_y)
svc.score(test_x_reduced, test_y)

0.22046374677953626

In [129]:
# Training-set accuracy of the logistic regression, for comparison with its
# test-set accuracy.
lr.score(X=train_x_reduced, y=train_y)

0.9371916736854771

In [128]:
# Random forest of 2500 depth-limited trees on the selected components;
# the last line reports test accuracy.
from sklearn.ensemble import RandomForestClassifier
rfc = RandomForestClassifier(
    n_estimators=2500,
    max_depth=10,
    random_state=42,  # fixed seed: the original forest gave a different score on every run
    n_jobs=-1,        # build the trees on all available cores; does not change the result
)
rfc.fit(train_x_reduced, train_y)
rfc.score(test_x_reduced, test_y)

0.8601398601398601