# Lecture 10: Classification Part 1

### Classification using scikit-learn (with pandas)

Classification Algorithms covered:
1. k-Nearest Neighbors
2. Decision Trees / Random Forest
3. Logistic Regression

Notebook created by Jennifer Widom, modified by Lisa Wang.

In [None]:
import csv
import pandas as pd
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression

In [None]:
# Read Cities.csv into a dataframe and add a column for temperature category.
# Note: For a dataframe D and integer i, D.iloc[i] is the i-th row of D
# Passing the path lets pandas open *and close* the file itself (no leaked handle).
cities = pd.read_csv('Cities.csv')


def temperature_category(temperature):
    """Return the category label for one temperature value.

    Thresholds: below 5 -> 'cold', below 9 -> 'cool', below 15 -> 'warm',
    anything else -> 'hot'.
    """
    if temperature < 5:
        return 'cold'
    elif temperature < 9:
        return 'cool'
    elif temperature < 15:
        return 'warm'
    return 'hot'


# Label every row in one pass instead of looking rows up one at a time.
cities['category'] = cities['temperature'].map(temperature_category)

# Report how many cities fall into each category.
for label in ['cold', 'cool', 'warm', 'hot']:
    print('%s: %d' % (label, (cities['category'] == label).sum()))

In [None]:
# Create training and test sets for cities data
num_items = len(cities)
percent_train = 0.85
# Fixed: the original referenced the undefined name `numitems` here.
num_train = int(num_items * percent_train)
num_test = num_items - num_train
print('Training set %d items' % num_train)
print('Test set %d items' % num_test)
# The first num_train rows form the training set; the remaining rows form the test set.
citiesTrain = cities[0:num_train]
citiesTest = cities[num_train:]

In [None]:
# Show the first ten rows of the training set.
print(citiesTrain.head(10))

Pandas note: you can access an individual row by its row index, e.g.:

In [None]:
# .loc looks a row up by its index label (the older .ix indexer is no longer
# available in current pandas).
citiesTrain.loc[0]

In [None]:
# Show the first ten rows of the test set.
print(citiesTest.head(10))

### K-nearest-neighbors classification

In [None]:
# Predict temperature category from other features
features = ['longitude', 'latitude']

# Create classifier
neighbors = 7  # Number of neighbors to consider for k nearest neighbor classification
classifier = KNeighborsClassifier(n_neighbors=neighbors)

# Train the classifier on training data
classifier.fit(citiesTrain[features], citiesTrain['category'])

# Make predictions on training data
train_predictions = classifier.predict(citiesTrain[features])

# Make predictions on test data
test_predictions = classifier.predict(citiesTest[features])

num_train = len(citiesTrain)
num_test = len(citiesTest)

# Calculate training accuracy.
# Predictions come back in row order, so pair them positionally with the actual
# labels rather than looking each row up by its index label.
train_correct = 0
for predicted, actual in zip(train_predictions, citiesTrain['category'].values):
    print('Predicted: %s  Actual: %s' % (predicted, actual))
    if predicted == actual:
        train_correct += 1
print('Training Accuracy: %s' % (float(train_correct) / float(num_train)))
print("")

# Calculate test accuracy
test_correct = 0
for predicted, actual in zip(test_predictions, citiesTest['category'].values):
    print('Predicted: %s  Actual: %s' % (predicted, actual))
    if predicted == actual:
        test_correct += 1
print('Test Accuracy: %s' % (float(test_correct) / float(num_test)))
# Comment out print, try other values for neighbors, other features

### <font color="green">Your Turn: K-nearest-neighbors on World Cup Data</font>

In [None]:
# Predict position from one or more of minutes, shots, passes, tackles, saves.
# This cell does all the set-up, including reordering the data to avoid team bias.
# Passing the path lets pandas open and close the file itself (no leaked handle).
players = (
    pd.read_csv('Players.csv')
    .sort_values(by='surname')   # reorder rows by surname
    .reset_index(drop=True)      # renumber rows 0..n-1 after the sort
)
num_items = len(players)
percent_train = 0.95
num_train = int(num_items * percent_train)
num_test = num_items - num_train
print('Training set %d items' % num_train)
print('Test set %d items' % num_test)
# The first num_train rows form the training set; the remaining rows form the test set.
playersTrain = players[0:num_train]
playersTest = players[num_train:]

In [None]:
# Columns available as classifier inputs.
features = [
    'minutes',
    'shots',
    'passes',
    'tackles',
    'saves',
]
# Predict a player's position ( playersTrain['position'] )

### Decision tree classification

In [None]:
# Predict temperature category from other features
features = ['longitude', 'latitude']

# Create classifier
split = 10
dt = DecisionTreeClassifier(min_samples_split=split)  # parameter is optional

# Train the classifier on training data
dt.fit(citiesTrain[features], citiesTrain['category'])

# Make predictions on training data
train_predictions = dt.predict(citiesTrain[features])

# Make predictions on test data
test_predictions = dt.predict(citiesTest[features])

num_train = len(citiesTrain)
num_test = len(citiesTest)

# Calculate training accuracy.
# Predictions come back in row order, so pair them positionally with the actual
# labels rather than looking each row up by its index label.
train_correct = 0
for predicted, actual in zip(train_predictions, citiesTrain['category'].values):
    # Uncomment to inspect individual predictions:
    # print('Predicted: %s  Actual: %s' % (predicted, actual))
    if predicted == actual:
        train_correct += 1
print('Training Accuracy: %s' % (float(train_correct) / float(num_train)))
print("")

# Calculate test accuracy
test_correct = 0
for predicted, actual in zip(test_predictions, citiesTest['category'].values):
    # Uncomment to inspect individual predictions:
    # print('Predicted: %s  Actual: %s' % (predicted, actual))
    if predicted == actual:
        test_correct += 1
print('Test Accuracy: %s' % (float(test_correct) / float(num_test)))

### "Forest" of decision trees

In [None]:
# Predict temperature category from other features
features = ['longitude', 'latitude']

# Create classifier
trees = 10  # Try other values for trees
# A fixed random_state makes the forest (and so the accuracies below) repeatable
# from run to run; change or remove it to see the run-to-run variation.
rf = RandomForestClassifier(n_estimators=trees, random_state=0)

# Train the classifier on training data
rf.fit(citiesTrain[features], citiesTrain['category'])

# Make predictions on training data
train_predictions = rf.predict(citiesTrain[features])

# Make predictions on test data
test_predictions = rf.predict(citiesTest[features])

num_train = len(citiesTrain)
num_test = len(citiesTest)

# Calculate training accuracy.
# Predictions come back in row order, so pair them positionally with the actual
# labels rather than looking each row up by its index label.
train_correct = 0
for predicted, actual in zip(train_predictions, citiesTrain['category'].values):
    # Uncomment to inspect individual predictions:
    # print('Predicted: %s  Actual: %s' % (predicted, actual))
    if predicted == actual:
        train_correct += 1
print('Training Accuracy: %s' % (float(train_correct) / float(num_train)))
print("")

# Calculate test accuracy
test_correct = 0
for predicted, actual in zip(test_predictions, citiesTest['category'].values):
    # Uncomment to inspect individual predictions:
    # print('Predicted: %s  Actual: %s' % (predicted, actual))
    if predicted == actual:
        test_correct += 1
print('Test Accuracy: %s' % (float(test_correct) / float(num_test)))

### <font color="green">Your Turn: Decision tree and forest of trees on World Cup Data</font>

In [None]:
# SINGLE TREE
# Goal: predict position using any subset of minutes, shots, passes, tackles, saves.
# Experiment with the feature list and with min_samples_split.
# How high can you push the accuracy?
features = [
    'minutes',
    'shots',
    'passes',
    'tackles',
    'saves',
]

In [None]:
# FOREST OF TREES
# Goal: predict position using any subset of minutes, shots, passes, tackles, saves.
# Experiment with different values for n_estimators.
# How high can you push the accuracy?
features = [
    'minutes',
    'shots',
    'passes',
    'tackles',
    'saves',
]

### Logistic Regression Classification

In [None]:
# Predict temperature category from other features
features = ['longitude', 'latitude']

# Create classifier
lg = LogisticRegression()

# Train the classifier on training data
lg.fit(citiesTrain[features], citiesTrain['category'])

# Make predictions on training data
train_predictions = lg.predict(citiesTrain[features])

# Make predictions on test data
test_predictions = lg.predict(citiesTest[features])

num_train = len(citiesTrain)
num_test = len(citiesTest)

# Calculate training accuracy.
# Predictions come back in row order, so pair them positionally with the actual
# labels rather than looking each row up by its index label.
train_correct = 0
for predicted, actual in zip(train_predictions, citiesTrain['category'].values):
    # Uncomment to inspect individual predictions:
    # print('Predicted: %s  Actual: %s' % (predicted, actual))
    if predicted == actual:
        train_correct += 1
print('Training Accuracy: %s' % (float(train_correct) / float(num_train)))
print("")

# Calculate test accuracy
test_correct = 0
for predicted, actual in zip(test_predictions, citiesTest['category'].values):
    # Uncomment to inspect individual predictions:
    # print('Predicted: %s  Actual: %s' % (predicted, actual))
    if predicted == actual:
        test_correct += 1
print('Test Accuracy: %s' % (float(test_correct) / float(num_test)))

### <font color="green">Your Turn: Logistic Regression on World Cup Data</font>

In [None]:
# LOGISTIC REGRESSION
# Predict position from one or more of minutes, shots, passes, tackles, saves.
# Try different combinations of features.
# What's the highest accuracy you can get?
features = ['minutes', 'shots', 'passes', 'tackles', 'saves']