In [1]:
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.neighbors import KNeighborsClassifier
import sklearn.metrics as metrics
from sklearn.model_selection import GridSearchCV

In [2]:
# Constants

# Locations of the train/test CSVs read by the first loading cell, together
# with the column labels assigned to each file on load.
TRAIN_DATA_NAME = 'sentiment140/sentiment140_training.csv'
TRAIN_COLUMN_NAMES = ['sentiment', 'tweets']

TEST_DATA_NAME = 'sentiment140/sentiment140_test.csv'
TEST_COLUMN_NAMES = ['sentiment', 'tweets']

# Options forwarded to every pd.read_csv call in this notebook.
ENCODING = 'latin-1'
NROWS = 1_600_000  # upper bound on the number of rows read from a file

# Print complete cell contents instead of truncating long strings.
pd.set_option('display.max_colwidth', None)

In [3]:
# Training split: header=0 makes pandas treat the file's first row as a header
# and replace it with the labels in TRAIN_COLUMN_NAMES.
train_df = pd.read_csv(
    TRAIN_DATA_NAME, names=TRAIN_COLUMN_NAMES, header=0, encoding=ENCODING, nrows=NROWS
)
print('TRAIN DATASET: \n', train_df.iloc[:4])

# Test split, read with the same options but its own column labels.
test_df = pd.read_csv(
    TEST_DATA_NAME, names=TEST_COLUMN_NAMES, header=0, encoding=ENCODING, nrows=NROWS
)
print('\nTEST DATASET: \n', test_df.iloc[:4])

TRAIN DATASET: 
    sentiment  \
0          0   
1          0   
2          0   
3          0   

                                                                                                                tweets  
0  @switchfoot http://twitpic.com/2y1zl - Awww, that's a bummer.  You shoulda got David Carr of Third Day to do it. ;D  
1      is upset that he can't update his Facebook by texting it... and might cry as a result  School today also. Blah!  
2                            @Kenichan I dived many times for the ball. Managed to save 50%  The rest go out of bounds  
3                                                                      my whole body feels itchy and like its on fire   

TEST DATASET: 
    sentiment  \
0          4   
1          4   
2          4   
3          4   

                                                                                                                                         tweets  
0                               @stellargirl I looooo

In [4]:
# Suffixes of the preprocessed dataset files: every combination of the
# apostrophe flag (w/wo), the "r" flag (w/wo) and the normalisation step
# (lem/stem). The normalisation varies slowest and the "r" flag fastest.
# NOTE(review): presumably w/wo mean "with"/"without" and lem/stem mean
# lemmatised/stemmed - confirm against the preprocessing notebook.
dataset_variations = [
    'cleaned_' + apostrophe + '_apostrophe_t_' + r_flag + '_r_' + normalisation
    for normalisation in ('lem', 'stem')
    for apostrophe in ('w', 'wo')
    for r_flag in ('w', 'wo')
]

In [5]:
# Bag of Words model
# Keep at most 10,000 vocabulary terms.
vectorizer = CountVectorizer(max_features=10000)

# Learn the vocabulary and build the training matrix in a single pass.
# CountVectorizer.fit() already runs fit_transform() internally and throws the
# matrix away, so fit() followed by transform() tokenised and counted every
# training tweet twice.
X_train_bow = vectorizer.fit_transform(train_df.tweets)

# fit() returns the vectorizer itself, so this is the same object the name
# was bound to before; it is kept for any cell that still refers to it.
BoWvectorizer = vectorizer

# Encode the test tweets with the vocabulary learned from the training data.
X_test_bow = BoWvectorizer.transform(test_df.tweets)


Unexpected exception formatting exception. Falling back to standard exception


Traceback (most recent call last):
  File "c:\Users\Anne\miniconda3\envs\ds\Lib\site-packages\IPython\core\interactiveshell.py", line 3526, in run_code
    exec(code_obj, self.user_global_ns, self.user_ns)
  File "C:\Users\Anne\AppData\Local\Temp\ipykernel_21352\685721583.py", line 3, in <module>
    BoWvectorizer = vectorizer.fit(train_df.tweets)
                    ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "c:\Users\Anne\miniconda3\envs\ds\Lib\site-packages\sklearn\feature_extraction\text.py", line 1323, in fit
    self.fit_transform(raw_documents)
  File "c:\Users\Anne\miniconda3\envs\ds\Lib\site-packages\sklearn\base.py", line 1473, in wrapper
    return fit_method(estimator, *args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "c:\Users\Anne\miniconda3\envs\ds\Lib\site-packages\sklearn\feature_extraction\text.py", line 1372, in fit_transform
    vocabulary, X = self._count_vocab(raw_documents, self.fixed_vocabulary_)
                    ^^^^^^^^^^^^^^^^^^^^^^^^^^

In [45]:
# Fit one k-NN classifier per k in (1, 3, 5, 7) on the bag-of-words training
# matrix. metric='minkowski' with p=1 is the Manhattan (L1) distance.
clf_knn1, clf_knn3, clf_knn5, clf_knn7 = (
    KNeighborsClassifier(n_neighbors=k, metric='minkowski', p=1).fit(
        X_train_bow, train_df.sentiment
    )
    for k in (1, 3, 5, 7)
)

In [46]:
# Predict the test labels with each fitted model, in the order k = 1, 3, 5, 7.
pred_X_test1, pred_X_test3, pred_X_test5, pred_X_test7 = (
    fitted_knn.predict(X_test_bow)
    for fitted_knn in (clf_knn1, clf_knn3, clf_knn5, clf_knn7)
)

In [47]:
# Per-class precision/recall/F1 for the k=1 model, followed by its accuracy.
_report_k1 = metrics.classification_report(test_df.sentiment, pred_X_test1)
print('classification report test\n', _report_k1)
print('test accuracy', metrics.accuracy_score(test_df.sentiment, pred_X_test1))

# Accuracy of every fitted model, each labelled with its k.
for _k, _predictions in (
    (1, pred_X_test1),
    (3, pred_X_test3),
    (5, pred_X_test5),
    (7, pred_X_test7),
):
    print('test accuracy' + str(_k), metrics.accuracy_score(test_df.sentiment, _predictions))

classification report test
               precision    recall  f1-score   support

           0       0.63      0.75      0.68       177
           4       0.70      0.57      0.63       182

    accuracy                           0.66       359
   macro avg       0.66      0.66      0.65       359
weighted avg       0.66      0.66      0.65       359

test accuracy 0.6573816155988857
test accuracy1 0.6573816155988857
test accuracy3 0.6740947075208914
test accuracy5 0.6657381615598886
test accuracy7 0.6824512534818942


Train- und Testdatensatz laden
Train-Test-Split machen
BoW
clf trainieren
Vorhersagen machen (predict)


# GridSearch
Train- und Testdatensätze laden   
BoW  
GridSearch:  
Mit den besten Parametern am Testdatensatz testen: Accuracy

In [6]:
def load_train_and_test_dataset(dataset_name):
    """Read the train and test CSV files of one preprocessed dataset variation.

    The files are expected at
    ``sentiment140/sentiment140_train_<dataset_name>.csv`` and
    ``sentiment140/sentiment140_test_<dataset_name>.csv``. Each file's first
    row is treated as a header and replaced by the configured column names,
    and at most ``NROWS`` rows are read per file.

    Args:
        dataset_name: suffix identifying the variation, e.g. an entry of
            ``dataset_variations``.

    Returns:
        A ``(train_df, test_df)`` tuple of DataFrames.
    """
    # (path prefix, column labels) for each split; train is read first.
    splits = (
        ('sentiment140/sentiment140_train_', TRAIN_COLUMN_NAMES),
        ('sentiment140/sentiment140_test_', TEST_COLUMN_NAMES),
    )

    frames = [
        pd.read_csv(
            path_prefix + dataset_name + '.csv',
            encoding=ENCODING,
            header=0,
            names=column_names,
            nrows=NROWS,
        )
        for path_prefix, column_names in splits
    ]

    return tuple(frames)

In [7]:
# Bag of Words model

def bow_model(X_train, X_test, max_features=15000):
    """Vectorize train and test documents with a shared bag-of-words vocabulary.

    The vocabulary is learned from ``X_train`` only. ``X_test`` is then
    encoded with that same vocabulary, so both matrices have the same columns.

    Args:
        X_train: iterable of training documents (strings).
        X_test: iterable of test documents (strings).
        max_features: upper bound on the vocabulary size handed to
            ``CountVectorizer``. Defaults to 15000, the value that used to be
            hard-coded here.

    Returns:
        A ``(X_train_bow, X_test_bow)`` tuple of document-term count matrices.
    """
    vectorizer = CountVectorizer(max_features=max_features)

    # One pass over the training data: CountVectorizer.fit() itself calls
    # fit_transform() and discards the matrix, so fit() + transform() would
    # tokenise and count every training document twice.
    X_train_bow = vectorizer.fit_transform(X_train)
    X_test_bow = vectorizer.transform(X_test)

    return X_train_bow, X_test_bow

In [8]:
def train_predict_metrics(name, metrics_array, X_train_bow, y_train, X_test_bow, y_test,
                          neighbor_values=range(1, 15, 2), p_values=(1, 2), pos_label=4):
    """Evaluate k-NN classifiers over a grid of k and Minkowski p values.

    For every combination of ``neighbor_values`` and ``p_values`` a
    ``KNeighborsClassifier`` is fitted on the training data, used to predict
    the test data, and scored. One row
    ``[name, neighbors, p, accuracy, precision, recall, f1]`` (scores rounded
    to 4 decimals) is appended to ``metrics_array`` per combination.

    Args:
        name: label stored in the first column of every appended row.
        metrics_array: list that receives the result rows; mutated in place.
        X_train_bow: training feature matrix.
        y_train: training labels.
        X_test_bow: test feature matrix.
        y_test: true test labels.
        neighbor_values: iterable of ``n_neighbors`` values to try. Defaults
            to the odd numbers 1..13, the range that used to be hard-coded.
        p_values: iterable of Minkowski exponents to try. Defaults to
            ``(1, 2)``, the values that used to be hard-coded.
        pos_label: label treated as the positive class for precision, recall
            and F1. Defaults to 4, the value that used to be hard-coded.

    Returns:
        None. Results are delivered through ``metrics_array``.
    """
    # Materialise once so a one-shot iterator is not exhausted after the
    # first pass of the outer loop.
    p_values = tuple(p_values)

    for neighbor in neighbor_values:
        for p in p_values:

            # classifier
            clf_knn = KNeighborsClassifier(n_neighbors=neighbor, metric='minkowski', p=p)
            clf_knn.fit(X_train_bow, y_train)

            # test data predictions
            pred_X_test = clf_knn.predict(X_test_bow)

            # Test Dataset Metrics
            metrics_array.append([
                name,
                neighbor,
                p,
                round(metrics.accuracy_score(y_test, pred_X_test), 4),
                round(metrics.precision_score(y_test, pred_X_test, pos_label=pos_label), 4),
                round(metrics.recall_score(y_test, pred_X_test, pos_label=pos_label), 4),
                round(metrics.f1_score(y_test, pred_X_test, pos_label=pos_label), 4)
            ])




In [9]:
# Collects one result row per (dataset, neighbors, p) combination.
metrics_array = list()

for dataset_name in dataset_variations:
    print('Dataset: ', dataset_name)

    train_df, test_df = load_train_and_test_dataset(dataset_name)

    # Separate the tweet text (features) from the sentiment label (target).
    X_train, y_train = train_df.tweets, train_df.sentiment
    X_test, y_test = test_df.tweets, test_df.sentiment

    X_train_bow, X_test_bow = bow_model(X_train, X_test)

    # Appends its rows to metrics_array in place.
    train_predict_metrics(
        dataset_name,
        metrics_array,
        X_train_bow,
        y_train,
        X_test_bow,
        y_test,
    )

Dataset:  cleaned_w_apostrophe_t_w_r_lem
Dataset:  cleaned_w_apostrophe_t_wo_r_lem
Dataset:  cleaned_wo_apostrophe_t_w_r_lem
Dataset:  cleaned_wo_apostrophe_t_wo_r_lem
Dataset:  cleaned_w_apostrophe_t_w_r_stem
Dataset:  cleaned_w_apostrophe_t_wo_r_stem
Dataset:  cleaned_wo_apostrophe_t_w_r_stem
Dataset:  cleaned_wo_apostrophe_t_wo_r_stem


In [10]:
# Display the collected result rows; each row is
# [dataset_name, neighbors, p, accuracy, precision, recall, f1].
metrics_array

[['cleaned_w_apostrophe_t_w_r_lem', 1, 1, 0.6713, 0.7963, 0.4725, 0.5931],
 ['cleaned_w_apostrophe_t_w_r_lem', 1, 2, 0.6657, 0.7818, 0.4725, 0.589],
 ['cleaned_w_apostrophe_t_w_r_lem', 3, 1, 0.6741, 0.8095, 0.467, 0.5923],
 ['cleaned_w_apostrophe_t_w_r_lem', 3, 2, 0.663, 0.7905, 0.456, 0.5784],
 ['cleaned_w_apostrophe_t_w_r_lem', 5, 1, 0.7019, 0.6596, 0.8516, 0.7434],
 ['cleaned_w_apostrophe_t_w_r_lem', 5, 2, 0.7019, 0.6638, 0.8352, 0.7397],
 ['cleaned_w_apostrophe_t_w_r_lem', 7, 1, 0.7075, 0.6652, 0.8516, 0.747],
 ['cleaned_w_apostrophe_t_w_r_lem', 7, 2, 0.7103, 0.6741, 0.8297, 0.7438],
 ['cleaned_w_apostrophe_t_w_r_lem', 9, 1, 0.7019, 0.6596, 0.8516, 0.7434],
 ['cleaned_w_apostrophe_t_w_r_lem', 9, 2, 0.7131, 0.6725, 0.8462, 0.7494],
 ['cleaned_w_apostrophe_t_w_r_lem', 11, 1, 0.7019, 0.6569, 0.8626, 0.7458],
 ['cleaned_w_apostrophe_t_w_r_lem', 11, 2, 0.7103, 0.6741, 0.8297, 0.7438],
 ['cleaned_w_apostrophe_t_w_r_lem', 13, 1, 0.7103, 0.6598, 0.8846, 0.7559],
 ['cleaned_w_apostrophe_t_w

In [11]:
# Tabulate the collected rows; the column order matches the order in which
# train_predict_metrics fills each row.
metrics_df = pd.DataFrame(
    data=metrics_array,
    columns=[
        'dataset_name',
        'neighbors',
        'p',
        'accuracy',
        'precision',
        'recall',
        'f1',
    ],
)
metrics_df

Unnamed: 0,dataset_name,neighbors,p,accuracy,precision,recall,f1
0,cleaned_w_apostrophe_t_w_r_lem,1,1,0.6713,0.7963,0.4725,0.5931
1,cleaned_w_apostrophe_t_w_r_lem,1,2,0.6657,0.7818,0.4725,0.5890
2,cleaned_w_apostrophe_t_w_r_lem,3,1,0.6741,0.8095,0.4670,0.5923
3,cleaned_w_apostrophe_t_w_r_lem,3,2,0.6630,0.7905,0.4560,0.5784
4,cleaned_w_apostrophe_t_w_r_lem,5,1,0.7019,0.6596,0.8516,0.7434
...,...,...,...,...,...,...,...
107,cleaned_wo_apostrophe_t_wo_r_stem,9,2,0.6964,0.6466,0.8846,0.7471
108,cleaned_wo_apostrophe_t_wo_r_stem,11,1,0.7047,0.6532,0.8901,0.7535
109,cleaned_wo_apostrophe_t_wo_r_stem,11,2,0.6852,0.6375,0.8791,0.7390
110,cleaned_wo_apostrophe_t_wo_r_stem,13,1,0.7047,0.6532,0.8901,0.7535


In [12]:
# Highest test accuracy over all dataset / neighbors / p combinations.
max_acc_value = metrics_df.accuracy.max()

# idxmax() returns the index *label* of the first row holding the maximum, so
# the row has to be fetched with the label-based .loc. The position-based
# .iloc only gives the same row while metrics_df keeps its default 0..n-1
# index, and silently picks a wrong row after any filtering or sorting.
metrics_df.loc[metrics_df.accuracy.idxmax()]

dataset_name    cleaned_w_apostrophe_t_w_r_stem
neighbors                                     5
p                                             1
accuracy                                 0.7549
precision                                0.7238
recall                                   0.8352
f1                                       0.7755
Name: 60, dtype: object

In [38]:
# Search space for the k-NN grid search. Every hyper-parameter lists a single
# value, so exactly one candidate is evaluated.
PARAM_GRID = dict(
    n_neighbors=[1],
    p=[1],
    weights=['uniform'],
    metric=['minkowski'],
)

# 2-fold cross-validated search scored by accuracy; refit=True retrains the
# best candidate on all rows handed to fit(), n_jobs=-1 uses every CPU core.
grid = GridSearchCV(
    estimator=KNeighborsClassifier(),
    param_grid=PARAM_GRID,
    scoring='accuracy',
    cv=2,
    refit=True,
    n_jobs=-1,
    verbose=10,
)

# Only rows 720000..879999 of the training data are used for the search.
# NOTE(review): X_train_bow / y_train are whatever the previously executed
# cell left in the kernel - presumably the last dataset variation of the
# evaluation loop; confirm this is the intended dataset.
# NOTE(review): the window looks chosen to straddle the class boundary of a
# label-sorted file - verify against the data.
_grid_rows = slice(720000, 880000)
grid.fit(X_train_bow[_grid_rows], y_train[_grid_rows])


Fitting 2 folds for each of 1 candidates, totalling 2 fits


In [39]:
# Results: the winning parameter combination, then its mean cross-validated
# accuracy, one per line.
print(grid.best_params_, grid.best_score_, sep='\n')

{'metric': 'minkowski', 'n_neighbors': 1, 'p': 1, 'weights': 'uniform'}
0.61444375
