https://github.com/dbabichenko/python_for_data_and_analytics/tree/master/12%20-%20Classification

In [1]:
# Required Python Machine learning Packages
import pandas as pd
import numpy as np
import operator

# To split the dataset into train and test datasets
from sklearn.model_selection import train_test_split

# Classifiers
from sklearn.naive_bayes import GaussianNB
from sklearn import svm
from sklearn.neighbors import KNeighborsRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier, export_graphviz

from sklearn.model_selection import cross_val_score, cross_val_predict
from sklearn.feature_selection import RFE
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import chi2
from sklearn.decomposition import PCA
from sklearn.ensemble import ExtraTreesClassifier

# To calculate the accuracy score of the model
from sklearn.preprocessing import LabelEncoder
from sklearn import metrics

import matplotlib.pyplot as plt
%matplotlib inline

In [2]:
def label_encode(df, columns):
    for col in columns:
        le = LabelEncoder()
        col_values_unique = list(df[col].unique())
        le_fitted = le.fit(col_values_unique)
 
        col_values = list(df[col].values)
        #le.classes_
        col_values_transformed = le.transform(col_values)
        df[col] = col_values_transformed

In [3]:
def classify(predictors, response, classifier = 'svm', kern='rbf', neighbors=3, kfolds=0, report=False, features_select=0):        
    # split X and y into training and testing sets
    X_train, X_test, y_train, y_test = train_test_split(predictors, response, test_size=0.20, random_state=1)

    
    if classifier == 'nb':
        cl = GaussianNB() # instantiate model
        msg = 'Naive Bayes'
    elif classifier == 'svm':
        cl = svm.SVC(kernel=kern) # instantiate model
        msg = 'SVM with ' + kern + ' kernel'
    elif classifier == 'knn':
        cl = KNeighborsRegressor(n_neighbors=neighbors)
        msg = 'KNN with k=' + str(neighbors)
    elif classifier == 'rf':
        # Instantiate model with  decision trees
        # Note that the number of decision trees is denoted
        # using the neighbors parameter, which is set to 3 by default
        cl = RandomForestRegressor(n_estimators = 1000, random_state = 42)
        msg = 'Random Forest with ' + str(neighbors) + ' decision trees'
    elif classifier == 'dtree':
        cl = DecisionTreeClassifier(min_samples_split=20, random_state=99)
        msg = 'Decision tree'
    elif classifier == 'logisticregression':
        cl = LogisticRegression()
        msg = 'Logistic Regression'
        
    model = cl.fit(X_train, y_train)
    y_pred_class = model.predict(X_test)
    print(msg + ' model accuracy score: ', metrics.accuracy_score(y_test, y_pred_class.round()))
    
    if kfolds > 0:
        # Perform k-fold cross validation
        scores = cross_val_score(model, predictors, response, cv=kfolds)
        print('Cross-validated score:', scores.mean())
    
    if report == True:
        print('Confusion matrix')
        print(metrics.confusion_matrix(y_test, y_pred_class)) # confusion matrix
        print('Classification report')
        target_names=['low','med','high']
        print(metrics.classification_report(y_test, y_pred_class, target_names=target_names))
        
        print('______________________________')
    
    if features_select > 0:
        test = SelectKBest(score_func=chi2, k=features_select)
        fit = test.fit(predictors, response)
        
        # summarize scores
        np.set_printoptions(precision=3)
        # print(fit.scores_)
        features_scores = {}
        for i in range(0, len(predictors.columns)):
            features_scores[predictors.columns[i]] = fit.scores_[i]
            
        # print(features_scores)
        sorted_features_scores = sorted(features_scores.items(), key=operator.itemgetter(1))
        print(sorted_features_scores[-5:])        

In [4]:
def decompose(predictors):
    # feature extraction
    pca = PCA(n_components=3)
    fit = pca.fit(predictors)
    
    # summarize components
    # print(fit.explained_variance_ratio_)
    # print(fit.components_)
    return pd.DataFrame(fit.components_)

In [5]:
df = pd.read_csv("311TrimmedGrouped.csv")
df.head()

Unnamed: 0.1,Unnamed: 0,REQUEST_ID,CREATED_ON,REQUEST_TYPE,REQUEST_ORIGIN,STATUS,DEPARTMENT,NEIGHBORHOOD,COUNCIL_DISTRICT,WARD,...,daily.temperatureMinTime,daily.time,daily.uvIndex,daily.uvIndexTime,daily.visibility,daily.windBearing,daily.windGust,daily.windGustTime,daily.windSpeed,REQUEST_GROUP
0,13,205276,2017-12-31T13:42:00,Potholes,Website,1,DPW - Street Maintenance,Beechview,4.0,19.0,...,1514721600,1514696400,1,1514732400,9.1,261,5.37,1514696400,0.56,Roads
1,75,205180,2017-12-29T17:45:00,Potholes,Website,1,DPW - Street Maintenance,Manchester,6.0,21.0,...,1514541600,1514523600,1,1514559600,8.58,253,8.96,1514577600,1.31,Roads
2,96,205151,2017-12-29T14:58:00,Potholes,Call Center,1,DPW - Street Maintenance,Regent Square,9.0,14.0,...,1514541600,1514523600,1,1514559600,8.28,252,8.98,1514577600,1.26,Roads
3,112,205130,2017-12-29T14:03:00,Potholes,Call Center,1,DPW - Street Maintenance,Brookline,4.0,19.0,...,1514541600,1514523600,1,1514559600,8.59,253,9.22,1514577600,1.3,Roads
4,133,205098,2017-12-29T12:40:00,Potholes,Call Center,1,DPW - Street Maintenance,Lincoln Place,5.0,31.0,...,1514541600,1514523600,1,1514559600,8.32,252,9.37,1514577600,1.29,Roads


In [6]:
df.fillna(0, inplace=True) #fill zero instances of weather
df = df.reset_index()

In [7]:
df.columns

Index(['index', 'Unnamed: 0', 'REQUEST_ID', 'CREATED_ON', 'REQUEST_TYPE',
       'REQUEST_ORIGIN', 'STATUS', 'DEPARTMENT', 'NEIGHBORHOOD',
       'COUNCIL_DISTRICT', 'WARD', 'TRACT', 'PUBLIC_WORKS_DIVISION',
       'PLI_DIVISION', 'POLICE_ZONE', 'FIRE_ZONE', 'X', 'Y', 'GEO_ACCURACY',
       'current.apparentTemperature', 'current.cloudCover', 'current.dewPoint',
       'current.humidity', 'current.icon', 'current.precipIntensity',
       'current.precipProbability', 'current.precipType', 'current.pressure',
       'current.summary', 'current.temperature', 'current.time',
       'current.uvIndex', 'current.visibility', 'current.windBearing',
       'current.windGust', 'current.windSpeed',
       'daily.apparentTemperatureHigh', 'daily.apparentTemperatureHighTime',
       'daily.apparentTemperatureLow', 'daily.apparentTemperatureLowTime',
       'daily.apparentTemperatureMax', 'daily.apparentTemperatureMaxTime',
       'daily.apparentTemperatureMin', 'daily.apparentTemperatureMinTime',
 

In [8]:
# define X and y
X = df[['current.apparentTemperature', 'current.cloudCover', 'current.dewPoint', 'current.humidity',
       'current.icon', 'current.precipIntensity', 'current.precipProbability',
       'current.precipType', 'current.pressure', 'current.summary',
       'current.temperature', 'current.uvIndex',
       'current.visibility', 'current.windBearing', 'current.windGust',
       'current.windSpeed', 'daily.apparentTemperatureHigh',
       'daily.apparentTemperatureLow', 'daily.apparentTemperatureMax',
       'daily.apparentTemperatureMin', 'daily.cloudCover',
       'daily.dewPoint', 'daily.humidity', 'daily.icon', 'daily.moonPhase',
       'daily.precipAccumulation', 'daily.precipIntensity', 'daily.precipIntensityMax', 
       'daily.precipProbability', 'daily.precipType', 'daily.pressure',
       'daily.summary', 'daily.temperatureHigh', 'daily.temperatureLow', 
       'daily.temperatureMax', 'daily.temperatureMin', 'daily.uvIndex', 'daily.visibility',
       'daily.windBearing', 'daily.windGust', 'daily.windSpeed']]
label_encode(X, X.columns.values)

# y = df[['NEIGHBORHOOD']]
# y = df['DEPARTMENT']
y = df[['REQUEST_GROUP']]  

label_encode(y, y.columns.values)
# y = y['REQUEST_GROUP']

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  # Remove the CWD from sys.path while we load stuff.


In [9]:
#classify(X, y, classifier='nb', kfolds=10, report=False, features_select=3)
# classify(X, y, classifier='svm', kern='rbf', kfolds=10, report=False, features_select=3)
#X_pca = decompose(X)
#classify(X_pca, y, classifier='svm', kern='rbf', kfolds=10, report=False, features_select=3)
#print(X_pca.shape)
# classify(X, y, classifier='svm', kern='linear', kfolds=10, report=True)
# classify(X, y, classifier='svm', kern='poly', kfolds=10, report=True)
# classify(X, y, classifier='knn', neighbors=3, kfolds=10)
# classify(X, y, classifier='knn', neighbors=5, kfolds=10)
# classify(X, y, classifier='knn', neighbors=7)
# classify(X, y, classifier='rf', neighbors=1000, kfolds=10)
# classify(X, y, classifier='rf', neighbors=2000)
# classify(X, y, classifier='dtree', kfolds=10)
classify(X, y, classifier='logisticregression', kfolds=10)

  y = column_or_1d(y, warn=True)


Logistic Regression model accuracy score:  0.39647991401316673


  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)


Cross-validated score: 0.34468697850918806


In [10]:
classify(X, y, classifier='nb', kfolds=10, report=False, features_select=3)

Naive Bayes model accuracy score:  0.25742308209055487


  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)


Cross-validated score: 0.2191990204851635
[('current.apparentTemperature', 1313257.9193770203), ('daily.temperatureMax', 1551303.8373217685), ('daily.temperatureHigh', 1616820.509686216), ('daily.apparentTemperatureMax', 1660423.1388360148), ('daily.apparentTemperatureHigh', 1677953.8489082388)]


In [11]:
classify(X, y, classifier='dtree', kfolds=10)

Decision tree model accuracy score:  0.38129786376461106
Cross-validated score: 0.21101278914533678


In [12]:
classify(X, y, classifier='rf', neighbors=2000)



Random Forest with 2000 decision trees model accuracy score:  0.38223834475345964


In [13]:
classify(X, y, classifier='rf', neighbors=1000, kfolds=10)



Random Forest with 1000 decision trees model accuracy score:  0.38223834475345964


  estimator.fit(X_train, y_train, **fit_params)


KeyboardInterrupt: 

In [14]:
classify(X, y, classifier='knn', neighbors=7)

KNN with k=7 model accuracy score:  0.3349455864570738


In [15]:
classify(X, y, classifier='knn', neighbors=5, kfolds=10)

KNN with k=5 model accuracy score:  0.3318554346365713
Cross-validated score: -0.5240302197102495


In [16]:
classify(X, y, classifier='knn', neighbors=3, kfolds=10)

KNN with k=3 model accuracy score:  0.32930270052398225
Cross-validated score: -0.7540960779808408


In [17]:
# classify(X, y, classifier='svm', kern='poly', kfolds=10, report=True)
# classify(X, y, classifier='svm', kern='linear', kfolds=10, report=True)

In [None]:
classify(X, y, classifier='svm', kern='rbf', kfolds=10, report=False, features_select=3)

  y = column_or_1d(y, warn=True)


SVM with rbf kernel model accuracy score:  0.3958081418782749


  y = column_or_1d(y, warn=True)


In [None]:
classify(X, y, classifier='svm', kern='ply', report=False, features_select=3)

In [None]:
X_pca = decompose(X)
classify(X_pca, y, classifier='svm', kern='rbf', kfolds=10, report=False, features_select=3)
print(X_pca.shape)