https://www.kaggle.com/c/cs5785-fall19-final/

In [1]:
import numpy as np 
import pandas as pd
import matplotlib.pyplot as plt
import matplotlib.image as mpimg
import csv
import nltk
from nltk.corpus import stopwords 
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
import string
import sklearn
from os import listdir
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import NearestNeighbors

%pylab inline

Populating the interactive namespace from numpy and matplotlib


#### Import Files

In [2]:
def get_flat_descriptions_from_folder():
    descriptions = []
    for i in range(10000):
        with open(('cs5785-fall19-final/descriptions_train/{}.txt').format(i), newline='') as f:
            desc1 = []
            reader = csv.reader(f)
            for row in reader:
                desc1.append(row)
            descriptions.append(desc1)
    
    # FIRST FLATTENING
    descriptions2 = []
    for description in descriptions:
        descriptions2.append([desc for sublist in description for desc in sublist])
        
    # MADE THE SENTENCES ALL ONE FOR EACH DESCRIPTION FILE
    flat_descriptions = []
    for description in descriptions2:
        desc1 = []
        for sentence in description:
            desc1 += sentence.split(' ')
        flat_descriptions.append(desc1)
    
    for i in range(len(flat_descriptions)):
        flat_descriptions[i] = (' ').join(flat_descriptions[i]).lower()
    
    return flat_descriptions

In [3]:
# function to preprocess data
def preprocessing(data):
    stop_words = set(stopwords.words('english')) # find stop words in English language
    lemmatizer = WordNetLemmatizer() # declare nltk lemmatizer

    # iterate through every sentence and replace it by itself lemmatized, without punctuation and without stop words
    for i in range(len(data)):
        sentence_no_punct = ''
        # remove punctuation
        
        for char in data[i]:
            if char not in string.punctuation:
                sentence_no_punct = sentence_no_punct + char
        data[i] = sentence_no_punct

        word_tokens = word_tokenize(data[i])
    
        # remove stop words and lemmatize
        word_tokens = [lemmatizer.lemmatize(word) for word in word_tokens if word not in stop_words and len(word) > 1]
        word_tokens = [lemmatizer.lemmatize(word, 'v') for word in word_tokens]
        word_tokens = [lemmatizer.lemmatize(word, 'a') for word in word_tokens]
        
        # remove conjunction words
        word_tokens = [word for word in word_tokens if word[-2:] != 'nt']
        (data[i]) = ' '.join(word_tokens)
        
    return data

In [4]:
# [  [ unprocessed description of image 1  ] , [ unprocessed description of image 2  ]   , ...  ]
train_flat_descs = get_flat_descriptions_from_folder()
test_flat_descs  = get_flat_descriptions_from_folder()

In [5]:
# [  [ processed description of image 1  ] , [ processed description of image 2  ]   , ...  ]
train_descs = preprocessing(train_flat_descs)
test_descs  = preprocessing(test_flat_descs)

#### Bag Of Words Features

In [6]:
# CREATE THE BAG OF WORDS DICTIONARY
def create_bow(train_descs):
    BOW = {}
    for description in train_descs:
        sentence_lst = description.split(' ')
        for word in sentence_lst:
            BOW[word] = 0
    BOW['null'] = 0
    return BOW

In [7]:
def create_bow_vectors(data, BOW):
    feature_vectors = []
    for description in data:
        feat_vec = BOW.copy()
        sentence_lst = description.split(' ')
        for word in sentence_lst:
            if word in feat_vec:
                feat_vec[word] += 1
            else:
                feat_vec['null'] += 1       
        feature_vectors.append(feat_vec)

    # TURN DICTIONARIES INTO A MATRIX with each row as one description
    feature_vector_matrix = []
    for feature_vec in feature_vectors:
        feature_vector_matrix.append(list(feature_vec.values()))

    # NORMALIZE THE FEATURES
    feature_vector_matrix = sklearn.preprocessing.normalize(feature_vector_matrix) # default is L2 norm
    return feature_vector_matrix

In [8]:
BOW = create_bow(train_descs)

In [9]:
# CREATE FEATURE VECTORS for TRAIN
feature_vector_matrix = create_bow_vectors(train_descs, BOW)

In [10]:
feature_vector_matrix_test = create_bow_vectors(test_descs, BOW)

#### Word to Vec Features

In [None]:
## NEED TO LOOK INTO THIS OPTION

#### Get Image Features

In [11]:
train_feat = pd.read_csv("cs5785-fall19-final/features_train/features_resnet1000_train.csv", header = None, index_col=None)
test_feat = pd.read_csv("cs5785-fall19-final/features_test/features_resnet1000_test.csv", header = None, index_col=None)

In [12]:
for i in range(len(train_feat[0])):
    train_feat[0][i] = int(train_feat[0][i].replace("images_train/", "").replace(".jpg", ""))

train_feat_sort = train_feat.sort_values(by=0)

for i in range(len(test_feat[0])):
    test_feat[0][i] = int(test_feat[0][i].replace("images_test/", "").replace(".jpg", ""))

test_feat_sort = test_feat.sort_values(by=0)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  import sys


### Try Multilayer Perceptron

In [19]:
from sklearn.model_selection import KFold
from sklearn.neural_network import MLPRegressor
from sklearn.metrics import accuracy_score

cv = KFold(3)

X = np.array(train_descs)
y = train_feat_sort.drop(columns=0).to_numpy()
accuracies = []

for train, test in cv.split(X, y):
    BOW = create_bow(X[train])
    X_train = create_bow_vectors(X[train], BOW)
    X_test = create_bow_vectors(X[test], BOW)
    clf = MLPRegressor(solver='sgd', alpha=1e-5, hidden_layer_sizes=(10,), random_state=1, max_iter = 1000)
    clf.fit(X_train, y[train])
    accuracies.append(clf.score(X_test, y[test]))



In [20]:
accuracies

[0.3906616631528084, 0.41478049752441326, 0.382613475489045]

## Do MLP with Proper KNN Scoring

In [21]:
# function to return the number of nearest neighbors specified
def knn_function(data, point, neighbors_number):
    knn = NearestNeighbors(n_neighbors=neighbors_number, algorithm='ball_tree')
    knn.fit(data)
    return knn.kneighbors(point, neighbors_number, False)

In [22]:
def scores(i):
    return (20+1-i)/20

In [23]:
from sklearn.model_selection import KFold
from sklearn.neural_network import MLPRegressor
from sklearn.metrics import accuracy_score

cv = KFold(3)

X = np.array(train_descs)
y = train_feat_sort.drop(columns=0).to_numpy()
accuracies = []

for train, test in cv.split(X, y):
    BOW = create_bow(X[train])
    X_train = create_bow_vectors(X[train], BOW)
    X_test = create_bow_vectors(X[test], BOW)
    clf = MLPRegressor(solver='sgd', alpha=1e-5, hidden_layer_sizes=(10,), random_state=1, max_iter = 1000)
    clf.fit(X_train, y[train])
    predictions = clf.predict(X_test)
    neighbors = knn_function(y[test], predictions, 20)
    scs = []
    for i in range(len(predictions)):
        good = False
        for j, n in enumerate(neighbors[i]):
            if i == n:
                scs.append(scores(j))
                good = True
                break
        if good == False:
            scs.append(0)
    accuracies.append(sum(scs)/len(predictions))

In [24]:
accuracies

[0.09013197360527894, 0.10840084008400842, 0.07253225322532253]

### Try Logistic Regression

In [89]:
# function to train logistic regression model, predict with it, and calculate the accuracy and confusion matrix
def Logistic_Regression(x_train, y_train, x_test, y_test):
    
    # initialize and fit logistic regression model with training data
    lr = LogisticRegression(solver = 'lbfgs', max_iter = 10000)
    lr.fit(x_train, y_train)
    
    # predict the result for the testing data
    lr_pred = lr.predict(x_test) 
    
    # calculate accuracy
    lr_acc = accuracy_score(lr_pred, y_test)
    
    # confusion matrix
    cfn_matrix_lr = confusion_matrix(y_test, lr_pred)
   
    # return accuracy and confusion matrix
    return lr_acc, cfn_matrix_lr

In [90]:
cv = KFold(3)

X = feature_vector_matrix
y = train_feat_sort.drop(columns=0).to_numpy()
accuracies = []

for train, test in cv.split(X, y):
    acc, cfn = Logistic_Regression(X[train], y[train], X[test], y[test])
    accuracies.append(acc)

ValueError: bad input shape (6666, 1000)

In [None]:
accuracies