In [1]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.neighbors import KNeighborsClassifier
import sklearn.metrics as metrics

In [2]:
# Notebook-wide configuration

# CSV locations of the training and test splits.
TRAIN_DATA_NAME = 'sentiment140/sentiment140_training.csv'
TEST_DATA_NAME = 'sentiment140/sentiment140_test.csv'

# Column labels handed to pd.read_csv via `names=` for each split.
TRAIN_COLUMN_NAMES = ['sentiment', 'tweets']
TEST_COLUMN_NAMES = ['sentiment', 'tweets']

# Parser options shared by every pd.read_csv call in this notebook.
ENCODING = 'latin-1'
NROWS = 1_600_000  # upper bound on the number of rows read per file

# Do not truncate wide text columns when frames are displayed.
pd.set_option('display.max_colwidth', None)

In [3]:
# Load the training split and preview it.
# header=0 combined with names=... makes pandas treat the first line of the file
# as a header row and replace it with the configured column labels.
train_df = pd.read_csv(TRAIN_DATA_NAME, encoding=ENCODING, header=0,
                       names=TRAIN_COLUMN_NAMES, nrows=NROWS)

print('TRAIN DATASET: \n', train_df.head(4))


# Load the test split with the same parser settings and preview it.
test_df = pd.read_csv(TEST_DATA_NAME, encoding=ENCODING, header=0,
                      names=TEST_COLUMN_NAMES, nrows=NROWS)

print('\nTEST DATASET: \n', test_df.head(4))

TRAIN DATASET: 
    sentiment  \
0          0   
1          0   
2          0   
3          0   

                                                                                                                tweets  
0  @switchfoot http://twitpic.com/2y1zl - Awww, that's a bummer.  You shoulda got David Carr of Third Day to do it. ;D  
1      is upset that he can't update his Facebook by texting it... and might cry as a result  School today also. Blah!  
2                            @Kenichan I dived many times for the ball. Managed to save 50%  The rest go out of bounds  
3                                                                      my whole body feels itchy and like its on fire   

TEST DATASET: 
    sentiment  \
0          4   
1          4   
2          4   
3          4   

                                                                                                                                         tweets  
0                               @stellargirl I looooo

# TODO: add a comparison with the unprocessed (raw) dataset here

In [4]:
# Names of the pre-processed dataset variants to evaluate.
# Each name is 'cleaned_<w|wo>_apostrophe_t_<w|wo>_r_<lem|stem>'; the loops below
# enumerate all eight combinations, the four '..._lem' variants first, then '..._stem'.
# NOTE(review): 'lem'/'stem' presumably mean lemmatized/stemmed text - confirm
# against the notebook that produced these files.
dataset_variations = [
    'cleaned_' + apostrophe_flag + '_apostrophe_t_' + r_flag + '_r_' + normalizer
    for normalizer in ('lem', 'stem')
    for apostrophe_flag in ('w', 'wo')
    for r_flag in ('w', 'wo')
]

In [5]:
def load_train_and_test_dataset(dataset_name):
    """Read the train and test CSV files of one dataset variant.

    The files are looked up at
    ``sentiment140/sentiment140_train_<dataset_name>.csv`` and
    ``sentiment140/sentiment140_test_<dataset_name>.csv`` and parsed with the
    notebook-level ``ENCODING`` and ``NROWS`` settings.

    Parameters
    ----------
    dataset_name : str
        Variant suffix inserted into both file names.

    Returns
    -------
    tuple of pandas.DataFrame
        ``(train_df, test_df)``, with columns named by ``TRAIN_COLUMN_NAMES``
        and ``TEST_COLUMN_NAMES`` respectively.
    """

    def _read_split(split, column_names):
        # header=0 + names=...: the file's first line is consumed as a header
        # row and replaced by the supplied column labels.
        csv_path = 'sentiment140/sentiment140_' + split + '_' + dataset_name + '.csv'
        return pd.read_csv(
            csv_path,
            encoding=ENCODING,
            header=0,
            names=column_names,
            nrows=NROWS,
        )

    train_frame = _read_split('train', TRAIN_COLUMN_NAMES)
    test_frame = _read_split('test', TEST_COLUMN_NAMES)
    return train_frame, test_frame

In [6]:
# Tfidf model

def tfidf_model(X_train, X_test, max_features=15000):
    """Turn train and test texts into TF-IDF matrices.

    The vectorizer is fitted on the training texts only; the test texts are
    merely transformed with the fitted vectorizer.

    Parameters
    ----------
    X_train : iterable of str
        Training documents used to fit the vectorizer.
    X_test : iterable of str
        Test documents, transformed with the vectorizer fitted on ``X_train``.
    max_features : int or None, default 15000
        Forwarded to ``TfidfVectorizer(max_features=...)``.

    Returns
    -------
    tuple
        ``(X_train_tfidf, X_test_tfidf)`` as returned by the vectorizer.
    """
    vectorizer = TfidfVectorizer(max_features=max_features)

    # fit_transform fits and vectorizes the training texts in a single pass;
    # it is documented as equivalent to fit() followed by transform().
    X_train_tfidf = vectorizer.fit_transform(X_train)

    # The test texts must never influence the fit, so they are only transformed.
    X_test_tfidf = vectorizer.transform(X_test)

    return X_train_tfidf, X_test_tfidf

In [7]:
def train_predict_metrics(name, metrics_array, X_train_tfidf, y_train, X_test_tfidf, y_test,
                          neighbors=range(1, 15, 2), p_values=(1, 2), pos_label=4):
    """Evaluate KNN classifiers over a small hyper-parameter grid.

    For every combination of ``neighbors`` x ``p_values`` a
    ``KNeighborsClassifier`` (Minkowski metric) is fitted on the training data,
    used to predict the test data, and one result row is appended to
    ``metrics_array``.

    Parameters
    ----------
    name : str
        Label stored in the first field of every appended row.
    metrics_array : list
        Mutated in place. Each appended row is
        ``[name, n_neighbors, p, accuracy, precision, recall, f1]`` with the
        four scores rounded to 4 decimals.
    X_train_tfidf, y_train
        Training features and labels passed to ``fit``.
    X_test_tfidf, y_test
        Test features passed to ``predict`` and the true labels they are scored against.
    neighbors : iterable of int, default ``range(1, 15, 2)``
        Values tried for ``n_neighbors``.
    p_values : iterable of int, default ``(1, 2)``
        Values tried for the Minkowski power parameter ``p``.
    pos_label : default 4
        Label treated as the positive class for precision, recall and F1.

    Returns
    -------
    None
        Results are delivered through ``metrics_array``.
    """
    # Materialize once so a one-shot iterable is not exhausted after the first
    # pass of the outer loop.
    p_values = tuple(p_values)

    for neighbor in neighbors:
        for p in p_values:

            # classifier
            clf_knn = KNeighborsClassifier(n_neighbors=neighbor, metric='minkowski', p=p)
            clf_knn.fit(X_train_tfidf, y_train)

            # test data predictions
            pred_X_test = clf_knn.predict(X_test_tfidf)

            # test dataset metrics
            metrics_array.append([
                name,
                neighbor,
                p,
                round(metrics.accuracy_score(y_test, pred_X_test), 4),
                round(metrics.precision_score(y_test, pred_X_test, pos_label=pos_label), 4),
                round(metrics.recall_score(y_test, pred_X_test, pos_label=pos_label), 4),
                round(metrics.f1_score(y_test, pred_X_test, pos_label=pos_label), 4)
            ])




In [8]:
# Accumulates the result rows appended by train_predict_metrics for every variant.
metrics_array = []

for dataset_name in dataset_variations:
    print('Dataset: ', dataset_name)

    train_df, test_df = load_train_and_test_dataset(dataset_name)

    # Texts are the features, the sentiment column is the target.
    X_train, y_train = train_df['tweets'], train_df['sentiment']
    X_test, y_test = test_df['tweets'], test_df['sentiment']

    X_train_tfidf, X_test_tfidf = tfidf_model(X_train, X_test)

    train_predict_metrics(
        name=dataset_name,
        metrics_array=metrics_array,
        X_train_tfidf=X_train_tfidf,
        y_train=y_train,
        X_test_tfidf=X_test_tfidf,
        y_test=y_test,
    )

Dataset:  cleaned_w_apostrophe_t_w_r_lem
Dataset:  cleaned_w_apostrophe_t_wo_r_lem
Dataset:  cleaned_wo_apostrophe_t_w_r_lem
Dataset:  cleaned_wo_apostrophe_t_wo_r_lem
Dataset:  cleaned_w_apostrophe_t_w_r_stem
Dataset:  cleaned_w_apostrophe_t_wo_r_stem
Dataset:  cleaned_wo_apostrophe_t_w_r_stem
Dataset:  cleaned_wo_apostrophe_t_wo_r_stem


In [9]:
# Tabulate the collected result rows; the column order matches the order of the
# fields in each row of metrics_array.
metrics_df = pd.DataFrame(
    data=metrics_array,
    columns=[
        'dataset_name',
        'neighbors',
        'p',
        'accuracy',
        'precision',
        'recall',
        'f1',
    ],
)
metrics_df

Unnamed: 0,dataset_name,neighbors,p,accuracy,precision,recall,f1
0,cleaned_w_apostrophe_t_w_r_lem,1,1,0.6407,0.7732,0.4121,0.5376
1,cleaned_w_apostrophe_t_w_r_lem,1,2,0.6490,0.7745,0.4341,0.5563
2,cleaned_w_apostrophe_t_w_r_lem,3,1,0.6323,0.7907,0.3736,0.5075
3,cleaned_w_apostrophe_t_w_r_lem,3,2,0.6212,0.7447,0.3846,0.5072
4,cleaned_w_apostrophe_t_w_r_lem,5,1,0.6156,0.7750,0.3407,0.4733
...,...,...,...,...,...,...,...
107,cleaned_wo_apostrophe_t_wo_r_stem,9,2,0.5543,0.5491,0.6758,0.6059
108,cleaned_wo_apostrophe_t_wo_r_stem,11,1,0.5794,0.8444,0.2088,0.3348
109,cleaned_wo_apostrophe_t_wo_r_stem,11,2,0.5655,0.5586,0.6813,0.6139
110,cleaned_wo_apostrophe_t_wo_r_stem,13,1,0.5682,0.8293,0.1868,0.3049


In [10]:
# Highest accuracy reached by any evaluated configuration.
max_acc_value = metrics_df.accuracy.max()

# idxmax() returns the index *label* of the best row, so the row has to be
# selected with the label-based .loc. The positional .iloc only gives the same
# row while the frame still has its default 0..n-1 index.
metrics_df.loc[metrics_df.accuracy.idxmax()]

dataset_name    cleaned_w_apostrophe_t_wo_r_lem
neighbors                                     1
p                                             2
accuracy                                 0.6546
precision                                0.7959
recall                                   0.4286
f1                                       0.5571
Name: 15, dtype: object