In [1]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
from os import listdir
import nltk
from nltk.corpus import stopwords 
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
import string
import csv

from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB


stopWords = stopwords.words('english')
isStopWord = lambda w: w in stopWords or len(w) == 1

In [2]:
dataFolder = "cs5785-fall19-final"

descTrainFolder = dataFolder + "/descriptions_train"
descTestFolder = dataFolder + "/descriptions_test"

featTrainFolder = dataFolder + "/features_train"
featTestFolder = dataFolder + "/features_test"

imagesTrainFolder = dataFolder + "/images_train"
imagesTestFolder = dataFolder + "/images_test"

tagsTrainFolder = dataFolder + "/tags_train"
tagsTestFolder = dataFolder + "/tags_test"

folders = [descTrainFolder,   descTestFolder,   featTrainFolder, featTestFolder, 
           imagesTrainFolder, imagesTestFolder, tagsTrainFolder, tagsTestFolder]

In [3]:
def getFilesFromFolder(folder):
    return listdir(folder)

In [4]:
# function to preprocess data
def preprocessing(data):
    stop_words = set(stopwords.words('english')) # find stop words in English language
    lemmatizer = WordNetLemmatizer() # declare nltk lemmatizer

    # iterate through every sentence and replace it by itself lemmatized, without punctuation and without stop words
    for i in range(len(data)):
        sentence_no_punct = ''
        
        # remove punctuation
        for char in data[i]:
            if char not in string.punctuation:
                sentence_no_punct = sentence_no_punct + char
        data[i] = sentence_no_punct

        """
        Analyzing if words are upper/lower case is more for analyzing the intensity of the sentiment rather than classifying it. 
        """
        word_tokens = list(map(lambda r: r.lower(), word_tokenize(data[i])))
        
    
        # remove stop words and lemmatize
        word_tokens = [lemmatizer.lemmatize(word) for word in word_tokens if word not in stop_words and len(word) > 1]
        word_tokens = [lemmatizer.lemmatize(word, 'v') for word in word_tokens]
        word_tokens = [lemmatizer.lemmatize(word, 'a') for word in word_tokens]
        
        # remove conjunction words
        word_tokens = [word for word in word_tokens if word[-2:] != 'nt']
        (data[i]) = ' '.join(word_tokens)
    
    assert(len(data) == 1)
    return set(data[0].split())

In [5]:
def get_flat_descriptions_from_folder(folder):
    flat_descriptions = []
    
    amtFiles = len(getFilesFromFolder(folder))
    
    indexes = list(sorted([str(i) for i in range(amtFiles)]))
    
    for index in indexes:
        filename = folder + "/{}.txt".format(index)
        
        with open(filename, 'r') as d_file:
            flat_descriptions += [[' '.join(d_file.readlines()).replace("\n", "")]]
    
    return flat_descriptions

In [6]:
# [  [ unprocessed description of image 1  ] , [ unprocessed description of image 2  ]   , ...  ]
train_flat_descs = get_flat_descriptions_from_folder(descTrainFolder)
test_flat_descs  = get_flat_descriptions_from_folder(descTestFolder)

In [28]:
def unique_words_in_desc(desc):
    return np.array([list(preprocessing(d)) for d in desc])

In [29]:
# [  { processed description of image 1  } , { processed description of image 2  }   , ...  ]
train_descs = unique_words_in_desc(train_flat_descs)
test_descs  = unique_words_in_desc(test_flat_descs)

In [19]:
def get_feat_from_file(filename):
    with open(filename, 'r') as f:
        feats = list(csv.reader(f))
    sorted_feats = list(sorted(feats, key= lambda l: l[0]))
    
    return np.array([s[1:] for s in sorted_feats])

In [24]:
resTrainFile = featTrainFolder + "/features_resnet1000_train.csv"
train_feat = get_feat_from_file(resTrainFile)
    

resTestFile = featTestFolder + "/features_resnet1000_test.csv"
test_feat = get_feat_from_file(resTestFile)

In [38]:
# function to train logistic regression model, predict with it, and calculate the accuracy and confusion matrix
def Logistic_Regression(x_train, y_train, x_test, y_test):
    
    # initialize and fit logistic regression model with training data
    lr = LogisticRegression(solver = 'lbfgs', max_iter = 10000)
    
    for i in range(len(y_train)):
        lr.partial_fit(x_train[i], y_train[i])
    
    # predict the result for the testing data
    lr_pred = lr.predict(x_test) 
    
    # calculate accuracy
    lr_acc = accuracy_score(lr_pred, y_test)
    
    # confusion matrix
    cfn_matrix_lr = confusion_matrix(y_test, lr_pred)
   
    # return accuracy and confusion matrix
    return lr_acc,cfn_matrix_lr

In [39]:
acc, _ = Logistic_Regression(train_descs, train_feat, test_descs, test_feat)

AttributeError: 'LogisticRegression' object has no attribute 'partial_fit'

In [56]:
# function to train gaussian naive bayes model, predict with it, and calculate the accuracy and confusion matrix
def Gaussian_NB(x_train, y_train, x_test, y_test):
    
    # initialize and fit naive bayes model gaussian prior with training data
    gnb = GaussianNB()
    gnb.fit(x_train, y_train)

    # predict the results for the test set
    gnb_pred = gnb.predict(x_test) 
    
    # calculate accuracy
    gnb_acc = accuracy_score(gnb_pred, y_test)
    
    # confusion matrix
    cfn_matrix_gnb = confusion_matrix(y_test, gnb_pred)
    
    # return accuracy and confusion matrix
    return gnb_acc, cfn_matrix_gnb

In [57]:
acc, _ = Gaussian_NB(train_descs, train_feat, test_descs, test_feat)

ValueError: setting an array element with a sequence.

In [36]:
train_feat.shape

(10000, 1000)

In [37]:
train_descs.shape

(10000,)