In [321]:
import numpy as np
import pandas as pd
from sklearn.linear_model import LinearRegression, Ridge, Lasso
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt

%matplotlib inline

In [322]:
dataset = pd.read_csv("Ciolacu_C_Florentina_Neluța_train.csv")

In [323]:
print(dataset)

         Breed Name   Weight(g)  Height(cm)  Longevity(yrs) Energy level  \
0             Boxer  26543.5371     57.4573         10.0995         high   
1             Boxer  26025.6217         NaN          9.5560         high   
2    French Bulldog  11010.6222     29.7543         11.0604          med   
3             Boxer  30407.2704     60.7385          9.4011         high   
4        Rottweiler  43298.9210         NaN          8.5136          med   
5        Rottweiler  45545.6336     64.0186          9.7858          med   
6    French Bulldog  10742.5071     28.5554          9.8898          med   
7        Rottweiler  44622.3142     64.0519          9.1423          med   
8             Boxer  28024.3844     56.6882          7.2638         high   
9             Boxer  32488.1735     56.6639          8.8175         high   
10            Boxer  28467.4694     61.6554          9.7829         high   
11   French Bulldog  12804.3933     29.1849         10.0110          med   
12          

In [324]:
# Task 1

In [325]:
# Getting the list of all "Breed Names" and encoding them so to not use Strings
breed_list = set(dataset['Breed Name'].tolist())
breed_encoding = {}
for breed_name, index in zip(breed_list, range(len(breed_list))):
    breed_encoding[breed_name] = index
modified_dataset = dataset
modified_dataset['Breed Name'] = modified_dataset['Breed Name'].map(lambda el: breed_encoding.get(el))
# getting rid of Longevity column
modified_dataset = modified_dataset.drop('Longevity(yrs)', axis=1)
# Also getting rid of Owner Name (makes no sense for us) - Results don't depend on Owner Name
modified_dataset = modified_dataset.drop('Owner Name', axis=1)
print(modified_dataset)

     Breed Name   Weight(g)  Height(cm) Energy level Attention Needs  \
0             3  26543.5371     57.4573         high             med   
1             3  26025.6217         NaN         high            high   
2             2  11010.6222     29.7543          med            high   
3             3  30407.2704     60.7385         high            high   
4             1  43298.9210         NaN          med            high   
5             1  45545.6336     64.0186          med            high   
6             2  10742.5071     28.5554          med            high   
7             1  44622.3142     64.0519          med            high   
8             3  28024.3844     56.6882         high            high   
9             3  32488.1735     56.6639         high            high   
10            3  28467.4694     61.6554         high            high   
11            2  12804.3933     29.1849          med            high   
12            3  32107.8586     61.9592         high            

In [326]:
# Handling Missing Data
print(modified_dataset.isnull().sum())
modified_dataset = modified_dataset.dropna() # drop missing data
modified_dataset = modified_dataset.reset_index(drop=True)

Breed Name           0
Weight(g)            0
Height(cm)         184
Energy level         0
Attention Needs      0
Coat Lenght          0
Sex                  0
dtype: int64


In [327]:
# Handling Categorical Data
# The Categorical Data is handled using One Hot Encoder.
categorical_columns = ["Energy level", "Attention Needs", "Coat Lenght", "Sex"]
for column in categorical_columns:
    label_encoder = LabelEncoder()
    ohe_encoder = OneHotEncoder()
    unique_columns = set(dataset[column].tolist())
    unique_columns = {key: value for key, value in zip(range(len(unique_columns)), unique_columns)}
    modified_dataset[column + "_encoded"] = label_encoder.fit_transform(modified_dataset[column])
    aux = ohe_encoder.fit_transform(modified_dataset[column + "_encoded"].values.reshape(-1, 1)).toarray()
    aux = pd.DataFrame(aux, columns=[column + "_" + unique_columns.get(i) for i in range(aux.shape[1])])
    modified_dataset = pd.concat([modified_dataset, aux], axis=1)
    modified_dataset = modified_dataset.drop(column, axis=1)
    modified_dataset = modified_dataset.drop(column + "_encoded", axis=1)

In case you used a LabelEncoder before this OneHotEncoder to convert the categories to integers, then you can now use the OneHotEncoder directly.
In case you used a LabelEncoder before this OneHotEncoder to convert the categories to integers, then you can now use the OneHotEncoder directly.
In case you used a LabelEncoder before this OneHotEncoder to convert the categories to integers, then you can now use the OneHotEncoder directly.
In case you used a LabelEncoder before this OneHotEncoder to convert the categories to integers, then you can now use the OneHotEncoder directly.


In [328]:
# Normalization (Just for Weight(g) and Height(cm)). I am using Min Max Normalizer
# (X - min(X)) / (max(X) - min(X))
modified_dataset["Weight(g)"] = (modified_dataset["Weight(g)"] - modified_dataset["Weight(g)"].min()) / (modified_dataset["Weight(g)"].max() - modified_dataset["Weight(g)"].min())
modified_dataset["Height(cm)"] = (modified_dataset["Height(cm)"] - modified_dataset["Height(cm)"].min()) / (modified_dataset["Height(cm)"].max() - modified_dataset["Height(cm)"].min())

In [329]:
print(modified_dataset)

     Breed Name  Weight(g)  Height(cm)  Energy level_low  Energy level_med  \
0             3   0.400664    0.811784               1.0               0.0   
1             2   0.153204    0.219072               0.0               0.0   
2             3   0.462218    0.881986               1.0               0.0   
3             1   0.703393    0.952165               0.0               0.0   
4             2   0.148932    0.193421               0.0               0.0   
5             1   0.688683    0.952877               0.0               0.0   
6             3   0.424256    0.795329               1.0               0.0   
7             3   0.495370    0.794809               1.0               0.0   
8             3   0.431315    0.901603               1.0               0.0   
9             2   0.181781    0.206890               0.0               0.0   
10            3   0.489311    0.908103               1.0               0.0   
11            2   0.170286    0.178515               1.0        

In [330]:
# Vizualizarea Datelor
modified_dataset["Height(cm)"] = modified_dataset["Height(cm)"].apply(np.log)
modified_dataset
# modified_dataset.hist()

Unnamed: 0,Breed Name,Weight(g),Height(cm),Energy level_low,Energy level_med,Energy level_high,Attention Needs_low,Attention Needs_med,Attention Needs_high,Coat Lenght_long,Coat Lenght_med,Coat Lenght_short,Sex_male,Sex_female
0,3,0.400664,-0.208521,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,1.0
1,2,0.153204,-1.518354,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0
2,3,0.462218,-0.125579,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0
3,1,0.703393,-0.049017,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0
4,2,0.148932,-1.642884,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0
5,1,0.688683,-0.048269,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0
6,3,0.424256,-0.228999,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0
7,3,0.495370,-0.229653,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0
8,3,0.431315,-0.103581,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0
9,2,0.181781,-1.575570,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0


In [333]:
modified_dataset.hist()

AttributeError: 'DataFrame' object has no attribute 'isnan'

In [255]:
y = modified_dataset["Breed Name"]
test = modified_dataset.drop('Breed Name', axis=1)
x_train, x_test, y_train, y_test = train_test_split(test, y)

In [256]:
def plot_decision_boundary(model, X, y):
  """
    Use this to plot the decision boundary of a trained model.
  """
  
  xx, yy = np.mgrid[-5:5:.01, -5:5:.01]
  grid = np.c_[xx.ravel(), yy.ravel()]
  probs = model.predict_proba(grid)[:, 1].reshape(xx.shape)
  f, ax = plt.subplots(figsize=(8, 6))
  contour = ax.contourf(xx, yy, probs, 25, cmap="RdBu",
                        vmin=0, vmax=1)
  ax_c = f.colorbar(contour)
  ax_c.set_label("$P(y = 1)$")
  ax_c.set_ticks([0, .25, .5, .75, 1])

  ax.scatter(X[:,0], X[:, 1], c=y, s=50,
             cmap="RdBu", vmin=-.2, vmax=1.2,
             edgecolor="white", linewidth=1)

  ax.set(aspect="equal",
         xlim=(-5, 5), ylim=(-5, 5),
         xlabel="$X_1$", ylabel="$X_2$")

In [257]:
# testing Logistic Regression
from sklearn.metrics import accuracy_score
model = LogisticRegression()
model.fit(x_train, y_train)
accuracy = accuracy_score(model.predict(x_test), y_test)
print(LogisticRegression.__name__, accuracy)
  

LogisticRegression 0.8480392156862745




In [258]:
# testing Random Forests
model = RandomForestClassifier()
model.fit(x_train, y_train)
accuracy = accuracy_score(model.predict(x_test), y_test)
print(RandomForestClassifier.__name__, accuracy)

RandomForestClassifier 0.9901960784313726




In [259]:
# testing KNeighborsClassifier
model = KNeighborsClassifier(n_neighbors=2)
model.fit(x_train, y_train)
accuracy = accuracy_score(model.predict(x_test), y_test)
print(KNeighborsClassifier.__name__, accuracy)

KNeighborsClassifier 0.946078431372549


In [260]:
# Task 2

In [262]:
dataset = pd.read_csv("Holteiu_N_Daniel_Ninel_train.csv")

In [263]:
# get rid of Breed Name
modified_dataset = dataset
modified_dataset = modified_dataset.drop('Breed Name', axis=1)
modified_dataset = modified_dataset.drop('Owner Name', axis=1)
print(modified_dataset)

      Weight(g)  Height(cm)  Longevity(yrs) Energy level Attention Needs  \
0    26543.5371     57.4573         10.0995         high             med   
1    26025.6217         NaN          9.5560         high            high   
2    11010.6222     29.7543         11.0604          med            high   
3    30407.2704     60.7385          9.4011         high            high   
4    43298.9210         NaN          8.5136          med            high   
5    45545.6336     64.0186          9.7858          med            high   
6    10742.5071     28.5554          9.8898          med            high   
7    44622.3142     64.0519          9.1423          med            high   
8    28024.3844     56.6882          7.2638         high            high   
9    32488.1735     56.6639          8.8175         high            high   
10   28467.4694     61.6554          9.7829         high            high   
11   12804.3933     29.1849         10.0110          med            high   
12   32107.8

In [264]:
# Handling Missing Data
print(modified_dataset.isnull().sum())
modified_dataset = modified_dataset.dropna() # drop missing data
modified_dataset = modified_dataset.reset_index(drop=True)

Weight(g)            0
Height(cm)         184
Longevity(yrs)       0
Energy level         0
Attention Needs      0
Coat Lenght          0
Sex                  0
dtype: int64


In [265]:
# Handling Categorical Data
# The Categorical Data is handled using One Hot Encoder.
categorical_columns = ["Energy level", "Attention Needs", "Coat Lenght", "Sex"]
for column in categorical_columns:
    label_encoder = LabelEncoder()
    ohe_encoder = OneHotEncoder()
    unique_columns = set(dataset[column].tolist())
    unique_columns = {key: value for key, value in zip(range(len(unique_columns)), unique_columns)}
    modified_dataset[column + "_encoded"] = label_encoder.fit_transform(modified_dataset[column])
    aux = ohe_encoder.fit_transform(modified_dataset[column + "_encoded"].values.reshape(-1, 1)).toarray()
    aux = pd.DataFrame(aux, columns=[column + "_" + unique_columns.get(i) for i in range(aux.shape[1])])
    modified_dataset = pd.concat([modified_dataset, aux], axis=1)
    modified_dataset = modified_dataset.drop(column, axis=1)
    modified_dataset = modified_dataset.drop(column + "_encoded", axis=1)

In case you used a LabelEncoder before this OneHotEncoder to convert the categories to integers, then you can now use the OneHotEncoder directly.
In case you used a LabelEncoder before this OneHotEncoder to convert the categories to integers, then you can now use the OneHotEncoder directly.
In case you used a LabelEncoder before this OneHotEncoder to convert the categories to integers, then you can now use the OneHotEncoder directly.
In case you used a LabelEncoder before this OneHotEncoder to convert the categories to integers, then you can now use the OneHotEncoder directly.


In [266]:
# Normalization (Just for Weight(g) and Height(cm)). I am using Min Max Normalizer
# (X - min(X)) / (max(X) - min(X))
modified_dataset["Weight(g)"] = (modified_dataset["Weight(g)"] - modified_dataset["Weight(g)"].min()) / (modified_dataset["Weight(g)"].max() - modified_dataset["Weight(g)"].min())
modified_dataset["Height(cm)"] = (modified_dataset["Height(cm)"] - modified_dataset["Height(cm)"].min()) / (modified_dataset["Height(cm)"].max() - modified_dataset["Height(cm)"].min())

In [267]:
modified_dataset

Unnamed: 0,Weight(g),Height(cm),Longevity(yrs),Energy level_low,Energy level_med,Energy level_high,Attention Needs_low,Attention Needs_med,Attention Needs_high,Coat Lenght_long,Coat Lenght_med,Coat Lenght_short,Sex_male,Sex_female
0,0.400664,0.811784,10.0995,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,1.0
1,0.153204,0.219072,11.0604,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0
2,0.462218,0.881986,9.4011,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0
3,0.703393,0.952165,9.7858,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0
4,0.148932,0.193421,9.8898,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0
5,0.688683,0.952877,9.1423,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0
6,0.424256,0.795329,7.2638,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0
7,0.495370,0.794809,8.8175,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0
8,0.431315,0.901603,9.7829,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0
9,0.181781,0.206890,10.0110,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0


In [268]:
y = modified_dataset["Longevity(yrs)"]
test = modified_dataset.drop('Longevity(yrs)', axis=1)
x_train, x_test, y_train, y_test = train_test_split(test, y)

In [269]:
from sklearn.linear_model import LinearRegression, Ridge, Lasso
from sklearn.neighbors import KNeighborsRegressor
from sklearn.preprocessing import PolynomialFeatures

model = LinearRegression()
model.fit(x_train, y_train)
predictions = model.predict(x_test)

In [270]:
y_test = y_test.values

In [271]:
for y1, y2 in zip(y_test, predictions):
    print("y_test = ", y1, " y_predicted = ", y2, " Difference = ", abs(y1 - y2))

y_test =  10.26  y_predicted =  9.15625  Difference =  1.1037499999999998
y_test =  9.5303  y_predicted =  9.84375  Difference =  0.31344999999999956
y_test =  8.5406  y_predicted =  10.09375  Difference =  1.5531500000000005
y_test =  13.3059  y_predicted =  12.609375  Difference =  0.6965249999999994
y_test =  9.4372  y_predicted =  8.828125  Difference =  0.6090750000000007
y_test =  13.292  y_predicted =  12.6875  Difference =  0.6044999999999998
y_test =  10.3376  y_predicted =  8.875  Difference =  1.4626000000000001
y_test =  9.3195  y_predicted =  9.046875  Difference =  0.2726249999999997
y_test =  11.5131  y_predicted =  9.484375  Difference =  2.0287249999999997
y_test =  12.8782  y_predicted =  12.421875  Difference =  0.45632499999999965
y_test =  13.9095  y_predicted =  11.5625  Difference =  2.3469999999999995
y_test =  9.962  y_predicted =  11.046875  Difference =  1.0848750000000003
y_test =  9.8154  y_predicted =  9.296875  Difference =  0.5185250000000003
y_test =  8

<zip at 0x10e542448>