In [0]:
# Flag read by the `if not grading:` guards in this notebook: while it is False,
# the demo / sanity-check cells are executed.
grading = False

In [0]:
from copy import deepcopy

import pandas as pd
import spacy
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import PassiveAggressiveClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, classification_report
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import SVC, LinearSVC

In [0]:
def load_questions():
  """
  Task 1a)

  Load the city and person questions together with their classes.

  Reads "train_5500.label.txt" (ISO-8859-1 encoded) from the current working
  directory. Each line is expected to look like "<label> <question text>":

    LOC:city What is the name of the city that Maurizio Pellegrin lives in ?

  Only two labels are kept and mapped to a class:
    - "LOC:city" -> "CITY"   (all samples are kept)
    - "HUM:ind"  -> "HUMAN"  (only the first 128 samples in the file are kept)

  The text of a sample is everything AFTER the label, with surrounding
  whitespace removed. For the example above this is
  "What is the name of the city that Maurizio Pellegrin lives in ?" with
  class "CITY". Samples are returned in the order they appear in the file.

  :return: A list of questions in the format [ (text, class), ... ]
  :raises OSError: If "train_5500.label.txt" cannot be opened.
  """
  label_to_class = {"LOC:city": "CITY", "HUM:ind": "HUMAN"}
  max_human = 128  # cap on "HUMAN" samples so both classes are about equally large

  texts = []
  human_seen = 0
  with open("train_5500.label.txt", encoding="ISO-8859-1") as handle:
    for line in handle:
      # The label is the first whitespace-delimited token of the line.
      label, _, question = line.strip().partition(" ")
      question_class = label_to_class.get(label)
      if question_class is None:
        continue
      if question_class == "HUMAN":
        if human_seen >= max_human:
          continue
        human_seen += 1
      texts.append((question.strip(), question_class))
  return texts

In [0]:
if not grading:
    # Sanity check for load_questions(): count the samples per class.
    # `texts` is reused by the build_dataframe_q() check further down.
    from collections import Counter
    texts = load_questions()
    count = Counter(x[1] for x in texts)  # x[1] is the class of a (text, class) tuple
    print(count)
    # Expected output:
    # Counter({'CITY': 129, 'HUMAN': 128})

In [0]:
def build_dataframe_q(texts):
  """
  Task 1b)

  Preprocess the questions with spaCy and build a DataFrame that can be used
  for training a classifier.

  The spaCy model "en_core_web_sm" is loaded once, then every text is turned
  into a doc. For each token of a doc (see https://spacy.io/api/token):
    - if the token begins an entity, the entity type (e.g. "FAC") is kept
    - else, if the token is outside any entity and not a stopword, token.text
      is kept
    - stopwords are then skipped
    - for every other token, token.text is kept
      (so a non-stopword outside an entity contributes its text twice)

  The kept tokens of a document are joined with a single space.

  :param texts: A list of (text, class) tuples as returned by the
    "load_questions()" method.
  :return: A DataFrame with columns "text", "class", where "text"
    contains the preprocessed text from the list of texts, "class" the label of
    the text. The columns exist even when "texts" is empty.
  """
  nlp = spacy.load("en_core_web_sm")

  rows = []
  for text, label in texts:
    tokens = []
    for token in nlp(text):
      if token.ent_iob_ == "B":
        # First token of an entity: emit the entity type as an extra feature.
        tokens.append(token.ent_type_)
      elif token.ent_iob_ == "O" and not token.is_stop:
        # Plain content word: emitted here and once more below.
        tokens.append(token.text)
      if token.is_stop:
        continue
      tokens.append(token.text)
    rows.append({"text": " ".join(tokens), "class": label})

  # Naming the columns keeps the schema stable for an empty input list.
  df = pd.DataFrame(rows, columns=["text", "class"])
  return df

In [0]:
if not grading:
    # Sanity check for build_dataframe_q(): print three preprocessed rows; the
    # expected output is noted in the comments below each print.
    # Uses `texts` from the load_questions() check cell.
    df = build_dataframe_q(texts)
    print(df.iloc[10])
    # text        city city oldest oldest relationship relations...
    # class                                                    CITY
    # Name: 10, dtype: object
    print("----")
    print(df.iloc[100])
    # text        FAC McCarren Airport located located city city...
    # class                                                    CITY
    # Name: 100, dtype: object
    print("----")
    print(df.iloc[123])
    # text        graced graced airwaves airwaves pearls pearls ...
    # class                                                   HUMAN
    # Name: 123, dtype: object

In [0]:
def create_train_test_set(df, train_size, test_size, random_state=42):
  """
  Task 1c)

  Split a DataFrame into training and test set using scikit-learn's
  "train_test_split".

  The "text" column provides the samples and the "class" column the labels.
  train_size, test_size and random_state are forwarded unchanged to
  "train_test_split".

  :param df: The DataFrame as created by "build_dataframe_q()"
  :param train_size: The size of the training set.
  :param test_size: The size of the test set. Must add up to 1 with train_size.
    Example: train_size=0.3 and test_size=0.7 would be a 30:70 train:test split.
  :param random_state: Optional. Default: 42. The random state to ensure
    consistency across different function calls.
  :return: X_train, X_test, y_train, y_test - in other words: The training
    samples, the test samples, the training sample labels, the test sample
    labels.
  """
  X_train, X_test, y_train, y_test = train_test_split(
    df["text"],
    df["class"],
    train_size=train_size,
    test_size=test_size,
    random_state=random_state,
  )
  return X_train, X_test, y_train, y_test

In [0]:
if not grading:
    # Sanity check for create_train_test_set(): a 60:40 split of `df` (built in
    # the build_dataframe_q() check cell); expected sizes are listed below.
    X_train, X_test, y_train, y_test = create_train_test_set(df, 0.6, 0.4)
    print(len(X_train))
    print(len(X_test))
    print(len(y_train))
    print(len(y_test))
    # Expected output:
    # 154
    # 103
    # 154
    # 103

In [0]:
def gridsearch(clf, parameters, X, y):
  """
  Task 1d)

  Perform a gridsearch for a given classifier, data and parameters.
  Print the best parameters and also return them.

  The parameters of this function are the same ones you can pass directly into
  the scikit-learn GridSearchCV:
  https://scikit-learn.org/stable/modules/generated/sklearn.model_selection.GridSearchCV.html

  :param clf: A classifier object.
  :param parameters: The parameters to be tried out for the gridsearch.
  :param X: The (training) samples used to optimize for.
  :param y: The (training) sample labels used to optimize for.
  :return: The best parameters, as found by the gridsearch, as a dictionary.
  """
  search = GridSearchCV(clf, parameters)
  search.fit(X, y)
  best_parameters = search.best_params_
  print(best_parameters)
  return best_parameters

In [0]:
def evaluate_classifier(true_labels, predictions):
  """
  Task 1e)

  Evaluate how good the predictions of a classifier are.
  "true_labels" are the correct class labels (of the test set), "predictions"
  is what the classifier predicted for the same samples.

  Calculates:
    - accuracy
    - precision (weighted)
    - recall (weighted)
    - f1 (weighted)

  :param true_labels: The correct labels for the data.
  :param predictions: The labels as predicted by the classifier.
  :return: accuracy, precision, recall, f1
  """
  accuracy = accuracy_score(true_labels, predictions)
  # "weighted" averages the per-class scores by the number of true samples
  # of each class.
  precision = precision_score(true_labels, predictions, average="weighted")
  recall = recall_score(true_labels, predictions, average="weighted")
  f1 = f1_score(true_labels, predictions, average="weighted")
  return accuracy, precision, recall, f1

In [0]:
def train_classifier(df, clf, do_gridsearch=False, parameters=None):
  """
  Task 1f)

  Train and evaluate a classifier.

  Steps:
    - split the data (df) into training and test set with
      "create_train_test_set", using a 60:40 training:test split
    - fit a TfidfVectorizer on the training texts and transform them
      (other vectorizers are listed here:
      https://scikit-learn.org/stable/modules/classes.html#module-sklearn.feature_extraction.text)
    - optionally run "gridsearch" on the vectorized training data and apply
      the best parameters to the classifier
    - train the classifier with its .fit method
    - transform the test texts with the SAME fitted vectorizer and get the
      predictions with clf.predict()
    - calculate accuracy, precision, recall and f1-measure with
      "evaluate_classifier"

  The passed-in classifier is configured and fitted in place, so the caller
  can use it for further predictions afterwards.

  :param df: The DataFrame as created by "build_dataframe_q".
  :param clf: A scikit-learn classifier.
  :param do_gridsearch: Optional. Default: False. Whether to perform a gridsearch.
  :param parameters: Optional. Default: None. Only used if do_gridsearch=True.
    The parameters to use in the gridsearch.
  :return: accuracy, precision, recall, f1 (all 4 based on the classifier's
    performance), vectorizer
  """
  X_train, X_test, y_train, y_test = create_train_test_set(df, 0.6, 0.4)

  vectorizer = TfidfVectorizer()
  X_train_vectors = vectorizer.fit_transform(X_train)

  if do_gridsearch and parameters:
    best_parameters = gridsearch(clf, parameters, X_train_vectors, y_train)
    if best_parameters:
      clf.set_params(**best_parameters)

  clf.fit(X_train_vectors, y_train)

  # The test data must only be transformed, never re-fitted, so that it is
  # mapped onto the vocabulary learned from the training data.
  predictions = clf.predict(vectorizer.transform(X_test))

  accuracy, precision, recall, f1 = evaluate_classifier(y_test, predictions)
  return accuracy, precision, recall, f1, vectorizer

In [0]:
def build_pa(df):
  """
  Task 2a)

  https://scikit-learn.org/stable/modules/generated/sklearn.linear_model.PassiveAggressiveClassifier.html
  Train a PassiveAggressiveClassifier on the provided data.
  Accuracy, precision, recall and f1-measure should all be above 96%.

  Optimize the classifier parameters so the results of your "train_classifier"
  function are all above 96%.

  HINT: Perform a gridsearch to find optimized parameters to improve your
        classifier.

  Besides returning the scores, this prints them and the prediction for one
  sample question.

  :param df: The DataFrame passed on to "train_classifier".
  :return: accuracy, precision, recall, f1, vectorizer
  """
  classifier = PassiveAggressiveClassifier(random_state=42)
  scores = train_classifier(df, classifier)
  accuracy, precision, recall, f1, vectorizer = scores
  print(accuracy, precision, recall, f1)

  # Smoke test: classify a single hand-written question.
  sample = vectorizer.transform(["Who is Ghandi?"])
  print(classifier.predict(sample))
  return accuracy, precision, recall, f1, vectorizer

if not grading:
    # Demo run for build_pa(): reload and preprocess the questions, then train
    # and evaluate. build_pa() prints the scores itself, so they appear twice.
    q = load_questions()
    df = build_dataframe_q(q)
    accuracy, precision, recall, f1, v = build_pa(df)
    print(accuracy, precision, recall, f1)

In [0]:
def build_mnb(df):
  """
  Task 2b)

  https://scikit-learn.org/stable/modules/generated/sklearn.naive_bayes.MultinomialNB.html
  Train a MultinomialNB classifier on the provided data.
  Accuracy, precision, recall and f1-measure should all be above 79%.

  Optimize the classifier parameters so the results of your "train_classifier"
  function are all above 79%.

  HINT: Perform a gridsearch to find optimized parameters to improve your
        classifier.

  Besides returning the scores, this prints them and the prediction for one
  sample question.

  :param df: The DataFrame passed on to "train_classifier".
  :return: accuracy, precision, recall, f1, vectorizer
  """
  classifier = MultinomialNB()
  scores = train_classifier(df, classifier)
  accuracy, precision, recall, f1, vectorizer = scores
  print(accuracy, precision, recall, f1)

  # Smoke test: classify a single hand-written question.
  sample = vectorizer.transform(["Who is Ghandi?"])
  print(classifier.predict(sample))
  return accuracy, precision, recall, f1, vectorizer

if not grading:
    # Demo run for build_mnb(): reload and preprocess the questions, then train
    # and evaluate. build_mnb() prints the scores itself, so they appear twice.
    q = load_questions()
    df = build_dataframe_q(q)
    accuracy, precision, recall, f1, v = build_mnb(df)
    print(accuracy, precision, recall, f1)

In [0]:
def build_svc(df):
  """
  Task 2c)

  https://scikit-learn.org/stable/modules/generated/sklearn.svm.SVC.html
  Train an SVC on the provided data.
  Accuracy, precision, recall and f1-measure should all be above 96%.

  Optimize the classifier parameters so the results of your "train_classifier"
  function are all above 96%.

  HINT: Perform a gridsearch to find optimized parameters to improve your
        classifier.

  Besides returning the scores, this prints them and the prediction for one
  sample question.

  :param df: The DataFrame passed on to "train_classifier".
  :return: accuracy, precision, recall, f1, vectorizer
  """
  classifier = SVC()
  scores = train_classifier(df, classifier)
  accuracy, precision, recall, f1, vectorizer = scores
  print(accuracy, precision, recall, f1)

  # Smoke test: classify a single hand-written question.
  sample = vectorizer.transform(["Who is Ghandi?"])
  print(classifier.predict(sample))
  return accuracy, precision, recall, f1, vectorizer

if not grading:
    # Demo run for build_svc(): reload and preprocess the questions, then train
    # and evaluate. build_svc() prints the scores itself, so they appear twice.
    q = load_questions()
    df = build_dataframe_q(q)
    accuracy, precision, recall, f1, v = build_svc(df)
    print(accuracy, precision, recall, f1)