# In [None]:
# Mount Google Drive inside the Colab runtime; the data paths defined further
# down all live under the '/content/gdrive' mount point.
from google.colab import drive
drive.mount('/content/gdrive')


'''This program reads in the train and test data, optionally appends POS tags to the tokens, and builds TF-IDF features over word n-grams. It then trains an SVM on the training data and predicts labels for the test set. It plots the most informative features and prints a classification report.'''



import numpy as np
import time
import nltk
# The following NLTK data only needs to be downloaded once, on the first run.
nltk.download('averaged_perceptron_tagger')
import string
import csv
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.pipeline import Pipeline
import matplotlib.pyplot as plt
from sklearn.metrics import classification_report
from sklearn import svm
from sklearn.model_selection import GridSearchCV
import spacy


# Paths to the CSV data files on the mounted Google Drive.
# NOTE(review): "OD" / "ID" presumably stand for out-of-domain / in-domain
# data — confirm against the dataset description.
train_file_OD = '/content/gdrive/MyDrive/lfd-project-data/trainOD.csv'

train_file_ID = '/content/gdrive/MyDrive/lfd-project-data/trainID.csv'
dev_file_ID = '/content/gdrive/MyDrive/lfd-project-data/devID.csv'
test_file_ID = '/content/gdrive/MyDrive/lfd-project-data/testID.csv'

# Additional test files; judging by the file names there is one per topic.
test_trump = '/content/gdrive/MyDrive/lfd-project-data/testtrump.csv'
test_police = '/content/gdrive/MyDrive/lfd-project-data/testpolice.csv'
test_education = '/content/gdrive/MyDrive/lfd-project-data/testeducation.csv'
test_immigration = '/content/gdrive/MyDrive/lfd-project-data/testimmigration.csv'
test_economy = '/content/gdrive/MyDrive/lfd-project-data/testeconomy.csv'


def read_corpus(corpus_file, pos_tag):
    '''Read a CSV corpus and return its tokenized documents and labels.

    For every non-empty row the text is taken from the first column and the
    label from the last column. Give-away strings are stripped from the text,
    the text is tokenized with nltk.word_tokenize (which needs the NLTK
    tokenizer models to be downloaded) and punctuation tokens are dropped.

    Args:
        corpus_file: path of the CSV file to read.
        pos_tag: when truthy, the POS tag of every token is appended to the
            end of that document's token list.

    Returns:
        A tuple (documents, labels): documents is a list of token lists and
        labels is the list of label strings, in file order.
    '''
    # Strings that give the source away and are therefore removed.
    # 'â€¢' is listed before its prefix 'â€'; in the opposite order the prefix
    # is removed first and the longer string can never match.
    giveaways = ('timescontent.com', 'MATP', 'Reprint', 'â€¢', 'â€', 'Â', '™', 'Herald')

    documents = []
    labels = []
    # newline='' lets the csv module handle line endings inside quoted fields.
    with open(corpus_file, encoding='utf-8', errors='ignore', newline='') as f:
        for row in csv.reader(f, delimiter=','):
            if not row:
                # csv.reader yields an empty list for blank lines.
                continue

            text = row[0]
            for giveaway in giveaways:
                text = text.replace(giveaway, '')

            # Tokenize the cleaned text (not the raw csv row), then filter
            # punctuation.
            tokens = [token for token in nltk.word_tokenize(text)
                      if token not in string.punctuation]

            if pos_tag:
                # The tags are computed for the word tokens first and then
                # appended after them.
                tokens.extend(nltk_pos(tokens))

            documents.append(tokens)
            labels.append(row[-1])
    return documents, labels


def identity(x):
    '''Return the argument unchanged.

    Passed to TfidfVectorizer as both preprocessor and tokenizer because the
    documents are already lists of tokens.
    '''
    return x


def plot_coefficients(classifier, feature_names, top_features=40):
    '''Plot the most negative and most positive coefficients as a bar chart.

    Prints classifier.classes_ and shows one figure per row of
    classifier.coef_ (a single figure when coef_ has one row). Negative
    coefficients are drawn in red, the others in blue, and each bar is
    labelled with its feature name.

    Args:
        classifier: fitted estimator exposing coef_ and classes_.
        feature_names: sequence of feature names, indexed like the columns
            of coef_.
        top_features: number of features shown on each side; capped at the
            number of available features.
    '''
    feature_names = np.asarray(feature_names)
    # One row per coefficient vector; indexing row by row keeps every index
    # within range of feature_names.
    coef_rows = np.atleast_2d(classifier.coef_)
    # Asking for more features than exist would give plt.bar fewer heights
    # than positions.
    top_features = max(0, min(top_features, coef_rows.shape[1]))
    print(classifier.classes_)

    for row_index, coef in enumerate(coef_rows):
        order = np.argsort(coef)
        top_negative_coefficients = order[:top_features]
        # Slice from an explicit start: order[-0:] would select everything.
        top_positive_coefficients = order[len(order) - top_features:]
        top_coefficients = np.hstack([top_negative_coefficients, top_positive_coefficients])

        # create plot
        positions = np.arange(len(top_coefficients))
        plt.figure(figsize=(15, 5))
        colors = ['red' if c < 0 else 'blue' for c in coef[top_coefficients]]
        plt.bar(positions, coef[top_coefficients], color=colors)
        # Ticks share the bar positions so every label sits under its own bar.
        plt.xticks(positions, feature_names[top_coefficients], rotation=60, ha='right')
        if coef_rows.shape[0] > 1:
            plt.title('coef_ row %d' % row_index)
        plt.show()


def nltk_pos(txt):
    '''Return the POS tag of every token in txt, in order.

    txt is handed to nltk.pos_tag as-is; only the tag of each resulting
    (token, tag) pair is kept.
    '''
    return [tag for _, tag in nltk.pos_tag(txt)]

def grid_search():
    '''Run a grid search over SVC hyper-parameters and report the result.

    Reads the module-level X_train, Y_train, X_test, Y_test, min_ngram and
    max_ngram set in the main section. Prints the best parameter
    combination and a classification report for the test set.
    '''
    # Build the vectorizer here: the module-level `vec` is only created on
    # the non-grid-search path, so relying on it raises a NameError.
    vectorizer = TfidfVectorizer(preprocessor=identity, tokenizer=identity,
                                 ngram_range=(min_ngram, max_ngram))

    # Grid search
    param_grid = {'C': [0.1, 1, 10], 'gamma': ['scale', 'auto'], 'kernel': ['linear', 'rbf', 'poly', 'sigmoid']}
    grid = GridSearchCV(svm.SVC(), param_grid, refit=True, verbose=2)
    grid_classifier = Pipeline([('vec', vectorizer), ('cls', grid)])
    grid_classifier.fit(X_train, Y_train)
    print(grid.best_params_)
    grid_predictions = grid_classifier.predict(X_test)
    print(classification_report(Y_test, grid_predictions))


if __name__ == "__main__":

    # Edit these parameters according to preference.
    gridsearch = False
    postags = False
    min_ngram = 1
    max_ngram = 1

    # Reads in the full input texts with their corresponding labels.
    # Change the files to test different datasets.
    X_train, Y_train = read_corpus(train_file_OD, postags)
    X_test, Y_test = read_corpus(test_file_ID, postags)

    # Convert the (already tokenized) texts to TF-IDF vectors. Created before
    # the branch so that the name exists on the grid-search path as well.
    vec = TfidfVectorizer(preprocessor=identity, tokenizer=identity, ngram_range=(min_ngram, max_ngram))

    if gridsearch:
        grid_search()
    else:
        # Combine the vectorizer with a Support Vector Machine classifier
        clf = svm.LinearSVC()
        pipe = Pipeline([('vec', vec), ('cls', clf)])

        # Train through the pipeline, so the pipeline object itself is fitted
        # before predict is called on it and X_train keeps its raw token lists.
        pipe.fit(X_train, Y_train)
        classifier = pipe.named_steps['cls']

        # get_feature_names_out replaces get_feature_names in newer
        # scikit-learn releases; fall back for older installations.
        if hasattr(vec, 'get_feature_names_out'):
            feature_names = vec.get_feature_names_out()
        else:
            feature_names = vec.get_feature_names()
        plot_coefficients(classifier, feature_names)

        # lets the trained classifier predict labels for the test set
        Y_pred = pipe.predict(X_test)

        # calculate metrics
        print(classification_report(Y_test, Y_pred, digits=3))
