In [25]:
# Change directory to be where my code/data is
import os
os.chdir("/Users/Clair/machine_learning_sp20")

In [26]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.neighbors import KNeighborsRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
import statistics as stats
import numpy as np
import pandas as pd

In [258]:
####################################################

In [27]:
#####################################################
# CAR EVALUATION--CLASSIFICATION
# TARGET: classify the car as acceptable, unacceptable, good, or very good. (last column)
#####################################################
car_columns = ["buying", "maint", "doors", "persons", "lug_boot", "safety", "condition"]
car_data = pd.read_csv("car.data", delimiter=",", names=car_columns, header=None)


In [28]:
# Label or one-hot encode the data
car_data.buying.value_counts()
car_data.buying = car_data.buying.astype('category')
car_data["buying_cat"]= car_data.buying.cat.codes

car_data.maint.value_counts()
car_data.maint = car_data.maint.astype('category')
car_data["maint_cat"]= car_data.maint.cat.codes

car_data = pd.get_dummies(car_data, columns=["doors"])
car_data = pd.get_dummies(car_data, columns=["persons"])

car_data.lug_boot.value_counts()
car_data.lug_boot = car_data.lug_boot.astype('category')
car_data["lug_boot_cat"]= car_data.lug_boot.cat.codes

car_data = pd.get_dummies(car_data, columns=["safety"])

car_data.condition.value_counts()
car_data.condition = car_data.condition.astype('category')
car_data["cond_cat"]= car_data.condition.cat.codes

In [29]:
car_targets = car_data["condition"]
car_data = car_data.drop(columns=["buying", "maint", "lug_boot", "condition"])

In [30]:
train_data, test_data, train_targets, test_targets = train_test_split(car_data, car_targets, test_size = 0.3, shuffle = True)

In [31]:
# Create the classifier
classifier = KNeighborsClassifier(n_neighbors=5)

# Train the classifier
classifier.fit(train_data, train_targets)

# Make predictions on the test data
predictions = classifier.predict(test_data)

# Compute and print the accuracy
accuracy = accuracy_score(test_targets, predictions)
print("The classification classifier for the CAR DATA achieved {}% accuracy)".format(round(100 * accuracy), 4))


The classification classifier for the CAR DATA achieved 100.0% accuracy)


In [282]:
##################################################################

In [32]:
##################################################################
# AUTOMOBILE MPG--REGRESSION
# TARGET: the MPG column (first column)
##################################################################
mpg_columns = ["mpg", "cylinders", "displacement","horsepower", "weight", "acceleration", "model year","origin", "car name"]
mpg_data = pd.read_csv("auto-mpg.data", sep="\s+", names=mpg_columns, na_values=["?"])

In [33]:
#Set the targets
mpg_target = mpg_data["mpg"]

#Get rid of the target and the unneeded columns
data = mpg_data.drop(columns=["mpg"])
data = mpg_data.drop(columns=["car name"])

In [34]:
# Replace missing horsepower values with the mean horsepower
data[data.isna().any(axis=1)] # shows records with NA's
data.horsepower = data.horsepower.fillna(data["horsepower"].mean())

In [35]:
# Get the sets!
train_data, test_data, train_targets, test_targets = train_test_split(data, mpg_target, test_size = 0.3, shuffle = True)

In [36]:
# Train and report!
regr = KNeighborsRegressor(n_neighbors=5)
regr.fit(train_data, train_targets)
print("The regression classifier for the MPG data achieved", 100 * round(regr.score(test_data, test_targets), 4), "% accuracy")


The regression classifier for the MPG data achieved 71.66 % accuracy


In [None]:
##################################################################

In [37]:
##################################################################
# STUDENT PERFORMANCE--REGRESSION
# TARGET: the final grade. (last column--G3)
##################################################################
student_data = pd.read_csv("student-mat.csv", sep=";")

In [38]:
# Change the data types to be one-hot encoded or T/F columns
student_data["isMale"] = student_data.sex.map({"M": 1, "F": 0})

student_data["hasInternet"] = student_data.internet.map({"yes": 1, "no": 0})

student_data["famSizeGT3"] = student_data.famsize.map({"GT3": 1, "LE3": 0})

student_data["hasFamSup"] = student_data.famsup.map({"yes": 1, "no": 0})

student_data["hasSchoolSup"] = student_data.schoolsup.map({"yes": 1, "no": 0})

student_data["hasRomance"] = student_data.romantic.map({"yes": 1, "no": 0})

student_data["isPaid"] = student_data.paid.map({"yes": 1, "no": 0})

student_data["hasActivities"] = student_data.activities.map({"yes": 1, "no": 0})

student_data["hadNursery"] = student_data.nursery.map({"yes": 1, "no": 0})

student_data["hasHigher"] = student_data.higher.map({"yes": 1, "no": 0})

student_data["addressType"] = student_data.address.map({"U": 1, "R": 0})

student_data["pStatus"] = student_data.Pstatus.map({"T": 1, "A": 0})

student_data["isGPstudent"] = student_data.school.map({"GP": 1, "MS": 0})

#Guardian type
student_data.guardian.value_counts()
student_data.guardian = student_data.guardian.astype('category')
student_data["guardian_cat"]= student_data.guardian.cat.codes

#Mother job
student_data.Mjob.value_counts()
student_data.Mjob = student_data.Mjob.astype('category')
student_data["mJob_cat"]= student_data.Mjob.cat.codes

#Mother job
student_data.Fjob.value_counts()
student_data.Fjob = student_data.Fjob.astype('category')
student_data["fJob_cat"]= student_data.Fjob.cat.codes

#Mother job
student_data.reason.value_counts()
student_data.reason = student_data.reason.astype('category')
student_data["reason_cat"]= student_data.reason.cat.codes

grade_targets = student_data["G3"]

student_data = student_data.drop(columns=["school", "sex", "address", "famsize", "Pstatus", 
                                          "Mjob", "Fjob", "reason", "guardian", "schoolsup", 
                                          "famsup", "paid", "higher", "internet", "romantic", 
                                          "activities", "nursery", "G3"])


In [39]:
# Split up the data
train_data, test_data, train_targets, test_targets = train_test_split(student_data, grade_targets, test_size = 0.3, shuffle = True)

In [42]:
# Using Regression
regr = KNeighborsRegressor(n_neighbors=3)
regr.fit(train_data, train_targets)
print("The regression classifier for the GRADE PREDICTION achieved", 100 * round(regr.score(test_data, test_targets), 3), "% accuracy")

The regression classifier for the GRADE PREDICTION achieved 79.3 % accuracy
