In [14]:
import pandas as pd
import numpy as np
import random
from scipy.sparse import diags
from scipy.sparse import csr_matrix
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import KFold
from sklearn import tree
from sklearn.feature_extraction.text import TfidfVectorizer
import datetime
# Load the complete review training set; row 0 of the CSV holds the column names.
mtrain = pd.read_csv("reviews_tr.csv", sep=",", header=0)

In [6]:
# Restrict the experiments to the first 20000 rows of the loaded frame and
# split them into the raw review text and the matching labels.
train = mtrain.iloc[:20000]
data, label = train['text'], train['label']

In [3]:
def cv43(data, label, n_splits=5, random_state=None):
    """Cross-validate a decision tree on scikit-learn TF-IDF features.

    For every fold a fresh ``TfidfVectorizer`` is fitted on the training
    documents only, the held-out documents are transformed with it, and a
    ``DecisionTreeClassifier`` is trained and scored.

    Parameters
    ----------
    data : sequence of str (list, ndarray or pandas Series)
        The documents.
    label : sequence (list, ndarray or pandas Series)
        One label per document, in the same order as ``data``.
    n_splits : int, optional
        Number of consecutive (unshuffled) folds. Defaults to 5.
    random_state : int or None, optional
        Forwarded to ``DecisionTreeClassifier``. ``None`` (the default)
        reproduces the original, unseeded call.

    Returns
    -------
    list of float
        Misclassification rate on the held-out part of each fold.

    The running list of errors is printed after every fold.
    """
    # KFold yields *positional* indices. Indexing a pandas Series with an
    # integer array is a *label* lookup, which is only equivalent when the
    # index happens to be 0..n-1, so work on plain arrays instead.
    docs = np.asarray(data)
    y = np.asarray(label)

    kf = KFold(n_splits=n_splits)
    error = []
    for train, test in kf.split(y):
        vectorizer = TfidfVectorizer(min_df=1, token_pattern=r'\b\w+\b', smooth_idf=True)
        xtr = vectorizer.fit_transform(docs[train]).astype(float)
        xte = vectorizer.transform(docs[test]).astype(float)
        clf = tree.DecisionTreeClassifier(random_state=random_state)
        clf.fit(xtr, y[train])
        preds = clf.predict(xte)
        # Fraction of wrong predictions; comparing with != (rather than
        # subtracting) also works for non-numeric labels.
        error.append(float(np.mean(preds != y[test])))
        print(error)
    return error

In [4]:
# Run the cross-validation on scikit-learn TF-IDF features, printing a
# wall-clock timestamp before and after so the elapsed time can be read off
# the output. The per-fold error list is kept in `error43`.
print(datetime.datetime.now())
error43 = cv43(data,label)
print(datetime.datetime.now())

2016-10-12 19:19:24.845000
[0.24975]
[0.24975, 0.2775]
[0.24975, 0.2775, 0.271]
[0.24975, 0.2775, 0.271, 0.26425]
[0.24975, 0.2775, 0.271, 0.26425, 0.25525]
2016-10-12 19:20:35.829000


In [3]:
def cv13(data, label, n_splits=5, random_state=None):
    """Cross-validate a decision tree on raw unigram count features.

    For every fold a fresh ``CountVectorizer`` is fitted on the training
    documents only, the held-out documents are transformed with it, and a
    ``DecisionTreeClassifier`` is trained and scored.

    Parameters
    ----------
    data : sequence of str (list, ndarray or pandas Series)
        The documents.
    label : sequence (list, ndarray or pandas Series)
        One label per document, in the same order as ``data``.
    n_splits : int, optional
        Number of consecutive (unshuffled) folds. Defaults to 5.
    random_state : int or None, optional
        Forwarded to ``DecisionTreeClassifier``. ``None`` (the default)
        reproduces the original, unseeded call.

    Returns
    -------
    list of float
        Misclassification rate on the held-out part of each fold.

    The running list of errors is printed after every fold.
    """
    # KFold yields *positional* indices. Indexing a pandas Series with an
    # integer array is a *label* lookup, which is only equivalent when the
    # index happens to be 0..n-1, so work on plain arrays instead.
    docs = np.asarray(data)
    y = np.asarray(label)

    kf = KFold(n_splits=n_splits)
    error = []
    for train, test in kf.split(y):
        vectorizer = CountVectorizer(min_df=1, token_pattern=r'\b\w+\b')
        xtr = vectorizer.fit_transform(docs[train]).astype(float)
        xte = vectorizer.transform(docs[test]).astype(float)
        clf = tree.DecisionTreeClassifier(random_state=random_state)
        clf.fit(xtr, y[train])
        preds = clf.predict(xte)
        # Fraction of wrong predictions; comparing with != (rather than
        # subtracting) also works for non-numeric labels.
        error.append(float(np.mean(preds != y[test])))
        print(error)
    return error

In [4]:
# Run the cross-validation on raw unigram counts, printing a wall-clock
# timestamp before and after so the elapsed time can be read off the output.
# The per-fold error list is kept in `error13`.
# NOTE(review): the output recorded under this cell shows error rates such as
# 0.232775 (not a multiple of 1/4000) and a runtime of roughly four hours,
# which suggests it was produced with more than the 20000-row subset used
# elsewhere in this notebook -- confirm and re-run from a fresh kernel.
print(datetime.datetime.now())
error13 = cv13(data,label)
print(datetime.datetime.now())

2016-10-12 00:38:12.916000
[0.23125]
[0.23125, 0.232775]
[0.23125, 0.232775, 0.23115]
[0.23125, 0.232775, 0.23115, 0.23455]
[0.23125, 0.232775, 0.23115, 0.23455, 0.231875]
2016-10-12 04:35:57.860000


In [15]:
def tfidf(data, testdata):
    """Build hand-rolled TF-IDF matrices for a training and a test corpus.

    Term counts come from a ``CountVectorizer`` fitted on ``data``. Each
    term column is then scaled by ``log10(N) - log10(df)``, where ``N`` is
    the number of training documents and ``df`` the number of training
    documents containing the term. The weights learned on ``data`` are
    applied unchanged to ``testdata``.

    Returns the pair ``(weighted_train, weighted_test)`` of sparse matrices.
    """
    counter = CountVectorizer(token_pattern=r'\b\w+\b', min_df=1)

    train_counts = counter.fit_transform(data).astype(float)
    n_docs = train_counts.shape[0]

    # Document frequency: in how many training documents each term occurs.
    doc_freq = np.asarray((train_counts != 0).sum(axis=0)).ravel()

    # Put the per-term weights on a diagonal so that one matrix product
    # rescales every column of a count matrix.
    idf_weights = np.log10(n_docs) - np.log10(doc_freq)
    idf = diags(idf_weights)

    test_counts = counter.transform(testdata)
    return train_counts.dot(idf), test_counts.dot(idf)

In [16]:
def cv23(data, label, n_splits=5, random_state=None):
    """Cross-validate a decision tree on the notebook's own TF-IDF features.

    For every fold the ``tfidf`` helper defined in this notebook builds the
    training and held-out feature matrices (weights are derived from the
    training documents only), then a ``DecisionTreeClassifier`` is trained
    and scored.

    Parameters
    ----------
    data : sequence of str (list, ndarray or pandas Series)
        The documents.
    label : sequence (list, ndarray or pandas Series)
        One label per document, in the same order as ``data``.
    n_splits : int, optional
        Number of consecutive (unshuffled) folds. Defaults to 5.
    random_state : int or None, optional
        Forwarded to ``DecisionTreeClassifier``. ``None`` (the default)
        reproduces the original, unseeded call.

    Returns
    -------
    list of float
        Misclassification rate on the held-out part of each fold.

    The running list of errors is printed after every fold.
    """
    # KFold yields *positional* indices. Indexing a pandas Series with an
    # integer array is a *label* lookup, which is only equivalent when the
    # index happens to be 0..n-1, so work on plain arrays instead.
    docs = np.asarray(data)
    y = np.asarray(label)

    kf = KFold(n_splits=n_splits)
    error = []
    for train, test in kf.split(y):
        xtr, xte = tfidf(docs[train], docs[test])
        clf = tree.DecisionTreeClassifier(random_state=random_state)
        clf.fit(xtr, y[train])
        preds = clf.predict(xte)
        # Fraction of wrong predictions; comparing with != (rather than
        # subtracting) also works for non-numeric labels.
        error.append(float(np.mean(preds != y[test])))
        print(error)
    return error

In [17]:
# Run the cross-validation on the hand-rolled TF-IDF features, printing a
# wall-clock timestamp before and after so the elapsed time can be read off
# the output. The per-fold error list is kept in `error23`.
print(datetime.datetime.now())
error23 = cv23(data,label)
print(datetime.datetime.now())

2016-10-12 19:33:11.338000
[0.2435]
[0.2435, 0.26875]
[0.2435, 0.26875, 0.253]
[0.2435, 0.26875, 0.253, 0.2435]
[0.2435, 0.26875, 0.253, 0.2435, 0.26675]
2016-10-12 19:34:15.477000


In [18]:
def cv33(data, label, n_splits=5, random_state=None):
    """Cross-validate a decision tree on bigram count features.

    For every fold a fresh ``CountVectorizer`` restricted to bigrams
    (``ngram_range=(2, 2)``) is fitted on the training documents only, the
    held-out documents are transformed with it, and a
    ``DecisionTreeClassifier`` is trained and scored.

    Parameters
    ----------
    data : sequence of str (list, ndarray or pandas Series)
        The documents.
    label : sequence (list, ndarray or pandas Series)
        One label per document, in the same order as ``data``.
    n_splits : int, optional
        Number of consecutive (unshuffled) folds. Defaults to 5.
    random_state : int or None, optional
        Forwarded to ``DecisionTreeClassifier``. ``None`` (the default)
        reproduces the original, unseeded call.

    Returns
    -------
    list of float
        Misclassification rate on the held-out part of each fold.

    The running list of errors is printed after every fold.
    """
    # KFold yields *positional* indices. Indexing a pandas Series with an
    # integer array is a *label* lookup, which is only equivalent when the
    # index happens to be 0..n-1, so work on plain arrays instead.
    docs = np.asarray(data)
    y = np.asarray(label)

    kf = KFold(n_splits=n_splits)
    error = []
    for train, test in kf.split(y):
        vectorizer = CountVectorizer(ngram_range=(2, 2), min_df=1, token_pattern=r'\b\w+\b')
        xtr = vectorizer.fit_transform(docs[train]).astype(float)
        xte = vectorizer.transform(docs[test]).astype(float)
        clf = tree.DecisionTreeClassifier(random_state=random_state)
        clf.fit(xtr, y[train])
        preds = clf.predict(xte)
        # Fraction of wrong predictions; comparing with != (rather than
        # subtracting) also works for non-numeric labels.
        error.append(float(np.mean(preds != y[test])))
        print(error)
    return error

In [19]:
# Run the cross-validation on bigram counts, printing a wall-clock timestamp
# before and after so the elapsed time can be read off the output. The
# per-fold error list is kept in `error33`.
print(datetime.datetime.now())
error33 = cv33(data,label)
print(datetime.datetime.now())

2016-10-12 19:36:11.468000
[0.254]
[0.254, 0.2715]
[0.254, 0.2715, 0.26275]
[0.254, 0.2715, 0.26275, 0.2695]
[0.254, 0.2715, 0.26275, 0.2695, 0.26]
2016-10-12 19:42:54.763000
