In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the "../input/" directory.
# For example, running this (by clicking run or pressing Shift+Enter) will list the files in the input directory

import os
import matplotlib.pyplot as plt
import json
import csv
from sklearn import preprocessing
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier
from mpl_toolkits.mplot3d import Axes3D 

# Show which entries the input directory contains.
input_entries = os.listdir("../input")
print(input_entries)

# Any results you write to the current directory are saved as output.

In [None]:
# Build a dictionary mapping each pet id to the document-level sentiment *score*
# (data['documentSentiment']['score']) stored in that pet's sentiment file.
# Note: this is the score field, not the magnitude field.
SENTIMENT_DIR = '../input/train_sentiment'

pets_sentiment = {}
for filename in os.listdir(SENTIMENT_DIR):
    sentiment_path = os.path.join(SENTIMENT_DIR, filename)
    # os.listdir also returns sub-directories (e.g. notebook checkpoints);
    # only regular files can be opened and parsed.
    if not os.path.isfile(sentiment_path):
        continue
    # The pet id is the part of the file name before the first dot.
    petid = filename.split('.')[0]
    # Decode explicitly as UTF-8 so parsing does not depend on the platform's
    # default text encoding.
    with open(sentiment_path, encoding='utf-8') as f:
        data = json.load(f)
    pets_sentiment[petid] = data['documentSentiment']['score']

In [None]:
# Pair every pet's sentiment with its adoption speed. After this cell each value
# of pets_sentiment is a (sentiment, AdoptionSpeed) tuple, where AdoptionSpeed is
# still the raw string read from the CSV.
# newline='' is what the csv module requires so that quoted fields containing
# line breaks are parsed correctly; UTF-8 matches the default pandas uses when
# the same file is read with pd.read_csv further down.
with open("../input/train/train.csv", newline='', encoding='utf-8') as csvfile:
    for row in csv.DictReader(csvfile):
        pet_id = row['PetID']
        current = pets_sentiment.get(pet_id)
        # Skip pets without a sentiment entry, and entries that are already
        # paired (so re-running this cell does not nest tuples inside tuples).
        if current is None or isinstance(current, tuple):
            continue
        pets_sentiment[pet_id] = (current, row['AdoptionSpeed'])

# Pets that have a sentiment entry but no row in train.csv have no adoption speed;
# drop them so every remaining value has the same (sentiment, speed) shape.
pets_sentiment = {
    pet_id: pair
    for pet_id, pair in pets_sentiment.items()
    if isinstance(pair, tuple)
}

In [None]:
# Average sentiment score for each adoption-speed class (0-4), plotted on top of
# a scatter of every individual pet.
# Expects pets_sentiment values to be (sentiment, speed) pairs.
sentiments, speeds = zip(*pets_sentiment.values())
int_speeds = list(map(int, speeds))

# Collect the sentiments belonging to each class.
sentiments_by_speed = {speed: [] for speed in range(5)}
for speed, sentiment in zip(int_speeds, sentiments):
    sentiments_by_speed[speed].append(sentiment)

# Plain mean per class; a class with no samples yields NaN instead of raising
# ZeroDivisionError (matplotlib simply skips NaN points).
avg_sentiments = {
    speed: (sum(values) / len(values) if values else float('nan'))
    for speed, values in sentiments_by_speed.items()
}
print(avg_sentiments)
actual_avgs = list(avg_sentiments.values())

fig = plt.figure()
plt.scatter(int_speeds, sentiments, label='individual pets')
plt.scatter([0,1,2,3,4], actual_avgs, marker='^', label='class average')
plt.xlabel('AdoptionSpeed')
plt.ylabel('Sentiment score')
plt.title('Sentiment score by adoption speed')
plt.legend()
plt.show()

In [None]:
# Min-max normalise the training features and export two CSV files:
#   train_clean.csv    - only the count/amount columns rescaled to [0, 1]
#   train_all_norm.csv - every remaining column rescaled, plus the AdoptionSpeed label
def min_max_scale(frame):
    """Rescale each column of ``frame`` to the [0, 1] range.

    Computes (x - min) / (max - min) column by column. A constant column has a
    zero range; it is divided by 1 instead, so it becomes all zeros rather than
    NaN (which is what a division by zero would produce).

    Parameters
    ----------
    frame : pandas.DataFrame
        Numeric columns to rescale.

    Returns
    -------
    pandas.DataFrame
        A new frame with the same shape and labels as ``frame``.
    """
    lowest = frame.min()
    span = (frame.max() - lowest).replace(0, 1)
    return (frame - lowest) / span


df = pd.read_csv("../input/train/train.csv", sep=",")
labels = df['AdoptionSpeed']
# Drop the free-text / identifier columns and the target itself.
# NOTE(review): the scaling below assumes every remaining column is numeric - confirm.
df = df.drop(columns=['Name', 'RescuerID', 'Description', 'PetID', 'AdoptionSpeed'])

cols_to_norm = ['Age','VideoAmt', 'PhotoAmt', 'Fee']
df[cols_to_norm] = min_max_scale(df[cols_to_norm])

# Fully normalised copy (the columns scaled above are unaffected by a second pass).
normalized_df = min_max_scale(df)
normalized_df['AdoptionSpeed'] = labels

df.to_csv("train_clean.csv", index=False)
normalized_df.to_csv("train_all_norm.csv", index=False)

# Last expression of the cell so the preview is actually rendered.
df.head()


In [None]:
# Project the features onto their first three principal components and plot them
# in 3-D, coloured by adoption-speed class.
# random_state is fixed so the projection is reproducible when scikit-learn picks
# its randomized SVD solver.
pca = PCA(n_components=3, random_state=0)

# Projection of the features without standardisation. It is not used in this
# cell; the name is kept in case other cells rely on it.
newdata = pca.fit_transform(df)

# Standardise to zero mean / unit variance before PCA, so that no single feature
# dominates the components purely because of its scale. This refits `pca`.
ss = StandardScaler()
df_pca = pca.fit_transform(ss.fit_transform(df))

principalDf = pd.DataFrame(data=df_pca,
                           columns=['pc1', 'pc2', 'pc3'])
# Column-wise concat aligns on the index, attaching each row's label to its components.
finalDf = pd.concat([principalDf, labels], axis=1)

fig = plt.figure(figsize=(8, 8))
ax = fig.add_subplot(111, projection='3d')
ax.set_xlabel('Principal Component 1', fontsize=15)
ax.set_ylabel('Principal Component 2', fontsize=15)
ax.set_zlabel('Principal Component 3', fontsize=15)
ax.set_title('3 component PCA', fontsize=20)

targets = [0, 1, 2, 3, 4]
colors = ['r', 'g', 'b', 'y', 'm']
for target, color in zip(targets, colors):
    indicesToKeep = finalDf['AdoptionSpeed'] == target
    # Label each series explicitly so the legend does not depend on plotting order.
    ax.scatter(finalDf.loc[indicesToKeep, 'pc1'],
               finalDf.loc[indicesToKeep, 'pc2'],
               finalDf.loc[indicesToKeep, 'pc3'],
               c=color,
               s=50,
               label=str(target))
ax.legend(title='AdoptionSpeed')
ax.grid(True)