In [1]:
import pandas as pd
import numpy as np
import sklearn
import seaborn as sns
from matplotlib import pyplot as plt
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.metrics import roc_auc_score
from sklearn import model_selection
from sklearn import feature_extraction
import codecs
import json
import os

In [2]:
IMPORT = True
EXPORT = True
DISPLAY = True
VERBOSE = False
DATA_DIR = "data/"
NARRATIVE_DIR = "data/pizza_request_dataset/narratives"
SEED = 221

In [3]:
def get_datasets(type='medium'):
    if type not in ['medium', 'all', 'mini']:
        raise Exception("Invalid type!")
    train_full, val_full = pd.read_csv(DATA_DIR + f'train_{type}.csv'), pd.read_csv(DATA_DIR + f'val_{type}.csv')
    train_request_text, train_y = train_full['request_text'], train_full['requester_received_pizza']
    val_request_text, val_y  = val_full['request_text'], val_full['requester_received_pizza']
    return train_request_text, train_y, val_request_text, val_y

In [4]:
train_request_text, train_y, val_request_text, val_y = get_datasets()

In [5]:
train_y.shape, val_y.shape

((330,), (110,))

In [6]:
def ngram_solver(train_request_text, train_y, val_request_text, test_y, ngram_size=1): 
    vectorizer = feature_extraction.text.CountVectorizer(ngram_range=(ngram_size,ngram_size))
    vectorizer.fit(train_request_text) # build ngram dictionary
    train_features = vectorizer.transform(train_request_text) 
    val_features = vectorizer.transform(val_request_text)
    logreg = LogisticRegression()
    logreg.fit(train_features, train_y)
    y_pred = logreg.predict(val_features)
    score = roc_auc_score(val_y,y_pred)
    print(f'ROC AUC score for {ngram_size}-gram: {score}')

In [7]:
ngram_solver(train_request_text, train_y, val_request_text, val_y, ngram_size=1)

ROC AUC score for 1-gram: 0.537929495760821




In [8]:
ngram_solver(train_request_text, train_y, val_request_text, val_y, ngram_size=2)

ROC AUC score for 2-gram: 0.5185185185185185




In [9]:
ngram_solver(train_request_text, train_y, val_request_text, val_y, ngram_size=3)

ROC AUC score for 3-gram: 0.5


