https://www.kaggle.com/c/cs5785-fall19-final/

In [81]:
import string
from os import listdir

import matplotlib.pyplot as plt
import nltk
import numpy as np
import pandas as pd
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, confusion_matrix
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsRegressor, NearestNeighbors

# English stop words loaded from the NLTK stopwords corpus.
stopWords = stopwords.words('english')


def isStopWord(w):
    """Return True if ``w`` is an English stop word or is exactly one character long."""
    return w in stopWords or len(w) == 1

In [55]:
# Root directory of the data set; every folder below is a direct child of it.
dataFolder = "cs5785-fall19-final"

# Description folders (train / test).
descTrainFolder = "{}/descriptions_train".format(dataFolder)
descTestFolder = "{}/descriptions_test".format(dataFolder)

# Feature folders (train / test).
featTrainFolder = "{}/features_train".format(dataFolder)
featTestFolder = "{}/features_test".format(dataFolder)

# Image folders (train / test).
imagesTrainFolder = "{}/images_train".format(dataFolder)
imagesTestFolder = "{}/images_test".format(dataFolder)

# Tag folders (train / test).
tagsTrainFolder = "{}/tags_train".format(dataFolder)
tagsTestFolder = "{}/tags_test".format(dataFolder)

# All data folders, train before test within each kind.
folders = [
    descTrainFolder, descTestFolder,
    featTrainFolder, featTestFolder,
    imagesTrainFolder, imagesTestFolder,
    tagsTrainFolder, tagsTestFolder,
]

In [56]:
def getFilesFromFolder(folder):
    """Return the names of the entries found directly inside ``folder``.

    Thin wrapper around ``os.listdir``; the result is returned unchanged.
    """
    entry_names = listdir(folder)
    return entry_names

In [77]:
def preprocessing(data):
    """Clean the sentences in ``data`` in place and return the first one.

    Each entry is stripped of punctuation, tokenized, lower-cased, filtered
    of English stop words and one-character tokens, lemmatized (default
    part of speech, then verb, then adjective) and re-joined with single
    spaces. ``data`` itself is modified: every element is overwritten with
    its cleaned form.

    Args:
        data: mutable sequence of strings; must hold exactly one element.

    Returns:
        The cleaned string stored at ``data[0]``.

    Raises:
        AssertionError: if ``data`` does not contain exactly one element
            (checked after all entries have been processed).
    """
    english_stops = set(stopwords.words('english'))
    wnl = WordNetLemmatizer()

    for idx, raw_sentence in enumerate(data):
        # Drop every punctuation character, then store the stripped sentence
        # back before tokenizing it.
        data[idx] = ''.join(
            ch for ch in raw_sentence if ch not in string.punctuation
        )

        lowered_tokens = [tok.lower() for tok in word_tokenize(data[idx])]

        cleaned_tokens = []
        for tok in lowered_tokens:
            # Skip stop words and single-character tokens.
            if tok in english_stops or len(tok) <= 1:
                continue

            # Lemmatize three times: default POS, then verb, then adjective.
            lemma = wnl.lemmatize(tok)
            lemma = wnl.lemmatize(lemma, 'v')
            lemma = wnl.lemmatize(lemma, 'a')

            # Discard tokens ending in "nt" (contractions such as "dont"
            # once the apostrophe is gone).
            # NOTE(review): this also drops ordinary words ending in "nt"
            # (e.g. "front") -- confirm that is acceptable.
            if lemma.endswith('nt'):
                continue
            cleaned_tokens.append(lemma)

        data[idx] = ' '.join(cleaned_tokens)

    assert len(data) == 1
    return data[0]

In [64]:
def get_flat_descriptions_from_folder(folder):
    """Read ``0.txt``, ``1.txt``, ... from ``folder`` as one-line descriptions.

    The number of files read equals the number of entries reported by
    ``getFilesFromFolder(folder)``; files are addressed by index, not by the
    names that call returns.

    Args:
        folder: path of the directory holding the numbered ``.txt`` files.

    Returns:
        A list with one single-element list per file. The element is the
        file's lines joined with a space, with newline characters removed.
    """
    file_count = len(getFilesFromFolder(folder))
    collected = []

    for file_index in range(file_count):
        path = folder + "/{}.txt".format(file_index)

        with open(path, 'r') as handle:
            lines = handle.readlines()

        flattened = ' '.join(lines).replace("\n", "")
        collected.append([flattened])

    return collected

In [65]:
# Load the flattened descriptions for the train and test folders.
flat_descs_train, flat_descs_test = map(
    get_flat_descriptions_from_folder, (descTrainFolder, descTestFolder)
)

In [75]:
# Reduce every description to its set of distinct preprocessed words.
# The words are sorted so the resulting strings are identical from run to
# run; joining a bare set would yield an order that depends on string hashing.
train_descs = [' '.join(sorted(set(preprocessing(desc).split()))) for desc in flat_descs_train]
test_descs = [' '.join(sorted(set(preprocessing(desc).split()))) for desc in flat_descs_test]

In [69]:
# training data: headerless CSV read without an index column
resTrainFile = featTrainFolder + "/features_resnet1000_train.csv"
train_feat = pd.read_csv(resTrainFile, header=None, index_col=None)

# testing data: same layout; kept in its own variable so the training path
# is not overwritten by the test path
resTestFile = featTestFolder + "/features_resnet1000_test.csv"
test_feat = pd.read_csv(resTestFile, header=None, index_col=None)

In [49]:
def Logistic_Regression(x_train, y_train, x_test, y_test):
    """Fit a logistic-regression classifier and evaluate it on a test set.

    Args:
        x_train: training features passed to ``LogisticRegression.fit``.
        y_train: training labels.
        x_test: features to predict on.
        y_test: true labels for ``x_test``.

    Returns:
        Tuple ``(accuracy, confusion_matrix)`` computed from the predictions
        for ``x_test`` against ``y_test``.
    """
    # lbfgs solver with a high iteration cap.
    classifier = LogisticRegression(solver='lbfgs', max_iter=10000)
    classifier.fit(x_train, y_train)

    predictions = classifier.predict(x_test)

    # scikit-learn metrics take (y_true, y_pred) in that order.
    accuracy = accuracy_score(y_test, predictions)
    matrix = confusion_matrix(y_test, predictions)

    return accuracy, matrix

In [48]:
def Gaussian_NB(x_train, y_train, x_test, y_test):
    """Fit a Gaussian naive-Bayes classifier and evaluate it on a test set.

    Args:
        x_train: training features passed to ``GaussianNB.fit``.
        y_train: training labels.
        x_test: features to predict on.
        y_test: true labels for ``x_test``.

    Returns:
        Tuple ``(accuracy, confusion_matrix)`` computed from the predictions
        for ``x_test`` against ``y_test``.
    """
    classifier = GaussianNB()
    classifier.fit(x_train, y_train)

    predictions = classifier.predict(x_test)

    # scikit-learn metrics take (y_true, y_pred) in that order.
    accuracy = accuracy_score(y_test, predictions)
    matrix = confusion_matrix(y_test, predictions)

    return accuracy, matrix

In [83]:
def Random_Forest(x_train, y_train, x_test, y_test):
    """Fit a random-forest regressor and evaluate it on a test set.

    Args:
        x_train: training features passed to ``RandomForestRegressor.fit``.
        y_train: training targets.
        x_test: features to predict on.
        y_test: true targets for ``x_test``.

    Returns:
        Tuple ``(accuracy, confusion_matrix)`` computed from the predictions
        for ``x_test`` against ``y_test``.
    """
    # Trees are limited to a depth of 20.
    rf_regressor = RandomForestRegressor(max_depth=20)
    rf_regressor.fit(x_train, y_train)

    # Predict with the regressor fitted above (the previous code referenced
    # an undefined name here).
    rf_pred = rf_regressor.predict(x_test)

    # NOTE(review): accuracy_score and confusion_matrix are classification
    # metrics while this model is a regressor -- confirm they accept its
    # predictions, or switch to regression metrics.
    rf_acc = accuracy_score(y_test, rf_pred)
    cfn_matrix_rf = confusion_matrix(y_test, rf_pred)

    return rf_acc, cfn_matrix_rf

In [None]:
def KNN_Regressor(x_train, y_train, x_test, y_test):
    """Fit a k-nearest-neighbours regressor and evaluate it on a test set.

    Args:
        x_train: training features passed to ``KNeighborsRegressor.fit``.
        y_train: training targets.
        x_test: features to predict on.
        y_test: true targets for ``x_test``.

    Returns:
        Tuple ``(accuracy, confusion_matrix)`` computed from the predictions
        for ``x_test`` against ``y_test``.
    """
    # KNeighborsRegressor takes no ``max_depth`` argument (that is a tree
    # option), so the estimator is built with its default settings.
    knn_regressor = KNeighborsRegressor()
    knn_regressor.fit(x_train, y_train)

    knn_pred = knn_regressor.predict(x_test)

    # NOTE(review): accuracy_score and confusion_matrix are classification
    # metrics while this model is a regressor -- confirm they accept its
    # predictions, or switch to regression metrics.
    knn_acc = accuracy_score(y_test, knn_pred)
    cfn_matrix_knn = confusion_matrix(y_test, knn_pred)

    return knn_acc, cfn_matrix_knn

In [51]:
def knn_function(data, neighbors_number):
    """Fit a ball-tree nearest-neighbours index on ``data``.

    Args:
        data: samples to index, passed to ``NearestNeighbors.fit``.
        neighbors_number: value used for the estimator's ``n_neighbors``.

    Returns:
        The fitted ``NearestNeighbors`` estimator (not the neighbours
        themselves).
    """
    nbrs = NearestNeighbors(n_neighbors=neighbors_number, algorithm='ball_tree')
    return nbrs.fit(data)

In [84]:
# Model evaluation: the logistic-regression and naive-Bayes runs are disabled;
# only the random-forest run is executed.
#lr_acc, cfn_matrix_lr = Logistic_Regression(train_descs, train_feat, test_descs, test_feat)
#gnb_acc, cfn_matrix_gnb = Gaussian_NB(train_descs, train_feat, test_descs, test_feat)
# NOTE(review): train_descs / test_descs are lists of plain strings, which
# matches the "could not convert string to float" ValueError recorded below
# this cell. The text presumably has to be turned into numeric features
# before it can be used as model input -- confirm.
rf_acc, cfn_matrix_rf = Random_Forest(train_descs, train_feat, test_descs, test_feat)



ValueError: could not convert string to float: 'crowd skateboard put person pull skate trick picnic top rid watch skateboarder boarder stage use man show table'