In [1]:
import shutil
from sklearn.feature_extraction.text import TfidfVectorizer
import numpy as np
import sklearn.svm
import sklearn.metrics as METRICS

def get_all_files_with_extension (folder_address, file_extension, process_sub_folders = True):
    all_files = []
    if process_sub_folders:
        for root, dirs, files in shutil.os.walk(folder_address):
            for file in files:
                if file.endswith("." + file_extension):
                    all_files.append(shutil.os.path.join(root, file))
        return (all_files)
    else:
        for file in shutil.os.listdir(folder_address):
            if file.endswith("." + file_extension): #".txt" ;
                all_files.append(folder_address + file)
        return (all_files)

def get_raw_data():
    data_input = {
     "train_pos": {"path" : "data/train/pos", "label" : 1},
     "train_neg": {"path" : "data/train/neg", "label" : 0},
     "test_pos" : {"path" : "data/test/pos" , "label" : 1},
     "test_neg" : {"path" : "data/test/neg" , "label" : 0},
    }

    train = []
    test  = []
    for key in data_input:
        all_files = get_all_files_with_extension(data_input[key]["path"] , "txt")
        label = data_input[key]["label"]
        for file_path in all_files:
            with open (file_path, "rt" , encoding="utf-8") as file_handle:
                file_content = file_handle.read()

            if "train" in key:
                train.append((file_content, label))
            else:
                test.append((file_content, label))
    return train, test

## Load data here

In [2]:
train , test = get_raw_data()

In [3]:
train[0]


('For a movie that gets no respect there sure are a lot of memorable quotes listed for this gem. Imagine a movie where Joe Piscopo is actually funny! Maureen Stapleton is a scene stealer. The Moroni character is an absolute scream. Watch for Alan "The Skipper" Hale jr. as a police Sgt.',
 1)

In [4]:
train[-1]

('Not that I dislike childrens movies, but this was a tearjerker with few redeeming qualities. M.J. Fox was the perfect voice for Stuart and the rest of the talent was wasted. Hugh Laurie can be amazingly funny, but is not given the chance in this movie. It´s sugar-coated sugar and would hardly appeal to anyone over 7 years of age. See Toy Story, Monsters Inc. or Shrek instead. 3/10',
 0)

In [5]:
vectorizer = TfidfVectorizer(stop_words="english",
                               analyzer='word',
                               lowercase=True,
                               use_idf=True,
                               ngram_range=(1,3))

vectorizer.fit([x[0] for x in train])
feature_names = vectorizer.get_feature_names()

In [6]:
all_one_grams = [x for x in vectorizer.get_feature_names() if x.count(" ") == 0]
import random
random.shuffle(all_one_grams)
all_one_grams

['shrewsbury',
 'pollinating',
 'groundwork',
 'dragnet',
 'recollect',
 'agape',
 'haefengstal',
 'colorful',
 'auctioned',
 'std',
 'cauliflower',
 'tablet',
 'throwbacks',
 'studies',
 'sling',
 'cloaks',
 'meltingly',
 'reformer',
 'gardener',
 'stands',
 'bookman',
 'eliza',
 'headedness',
 'skewered',
 'quakers',
 'alveraze',
 'farkus',
 'moviestar',
 'freewheeling',
 'villified',
 'queries',
 'hairpin',
 'uproarious',
 'coppola',
 'spool',
 'sobs',
 'tupamaros',
 'flatness',
 'nightgown',
 'harry',
 'mcphee',
 'basinger',
 'foundational',
 'perish',
 'backstabbing',
 'presages',
 'draining',
 'churns',
 'improved',
 'clatch',
 'notting',
 'salina',
 'isaiah',
 'destroyed',
 'ecclestone',
 'cornering',
 'invergordon',
 '1854',
 'placings',
 'forevermore',
 'replete',
 'sandberg',
 'interprets',
 'consolidate',
 'elsewheres',
 'cyncial',
 'sandrelli',
 'colette',
 'ihave',
 'cowpies',
 'schmoeller',
 'porthos',
 'kuchler',
 'ecgtb',
 'imprisoned',
 'angkor',
 'glancing',
 'duchovn

In [7]:
train_x = vectorizer.transform([x[0] for x in train])
train_y = np.asarray([x[1] for x in train])

test_x = vectorizer.transform([x[0] for x in test])
test_y_true = np.asarray([x[1] for x in test])

In [8]:
for C_value_range in range(-5, +5):
    print("Training   ... C_value_range:" , C_value_range, " C:" , 2**C_value_range)
    classifier = sklearn.svm.LinearSVC(C=2**C_value_range, verbose=0)
    classifier.fit(train_x, train_y)
    print("Predicting ...")
    test_y_pred = classifier.predict(test_x)
    print ("accuracy :" , METRICS.accuracy_score(test_y_true , test_y_pred))
    print ("-"*80)

Training   ... C_value_range: -5  C: 0.03125
Predicting ...
accuracy : 0.84552
--------------------------------------------------------------------------------
Training   ... C_value_range: -4  C: 0.0625
Predicting ...
accuracy : 0.86084
--------------------------------------------------------------------------------
Training   ... C_value_range: -3  C: 0.125
Predicting ...
accuracy : 0.87008
--------------------------------------------------------------------------------
Training   ... C_value_range: -2  C: 0.25
Predicting ...
accuracy : 0.87636
--------------------------------------------------------------------------------
Training   ... C_value_range: -1  C: 0.5
Predicting ...
accuracy : 0.88172
--------------------------------------------------------------------------------
Training   ... C_value_range: 0  C: 1
Predicting ...
accuracy : 0.88492
--------------------------------------------------------------------------------
Training   ... C_value_range: 1  C: 2
Predicting ...
accu