In [115]:
from sklearn.datasets import load_iris
from sklearn.datasets import make_friedman1
from sklearn.feature_selection import RFE
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
import pandas as pd
import numpy as np


In [116]:
def tryNFeatures(x_train, y_train, x_test, y_test, clf, n):
    selector = RFE(clf, n, step=1)
    selector = selector.fit(x_train, y_train)
    accuracy = accuracy_score(y_test, selector.predict(x_test))
    return selector.support_, accuracy

In [117]:
def findBestFeatures(x_train, x_test, y_train, y_test):
    clf = LogisticRegression(random_state=0, solver='lbfgs', multi_class='multinomial')
    bestAccuracy = 0
    bestFeatures = None
    for n in range(1, x_train.shape[1]):
        features, accuracy = tryNFeatures(x_train, y_train, x_test, y_test, clf, n)
        #strictly bigger since we preffer less features
        if accuracy > bestAccuracy:
            bestAccuracy = accuracy
            bestFeatures = features
    return bestFeatures

In [176]:
def chooseFeatures(df):
    #split dataframe to x and y
    X, y = getXY(df)

    #split into train and test
    X_train, X_test, y_train, y_test = train_test_split(X.values, y.values, test_size = 0.2)
    
    #find most important features
    includedFeatures =  findBestFeatures(X_train, X_test, y_train, y_test)
    
    #Add the salespersons to the data model
    filter_col = [col for col in X if col.startswith('SP_')]
    for col in filter_col:
        includedFeatures[X.columns.get_loc(col)]= True
    
    featuresNames = []
    #Return names of includedFeatures
    for i, include in enumerate(includedFeatures):
        if include:
            featuresNames.append(X.columns[i])
    return featuresNames

In [177]:
def getXY(df):
    x = df.loc[:, df.columns != 'Success']
    y = df['Success']
    return x, y

In [178]:
def dropIrrelevantFeatures(df, includedFeatures):
    relevantDf = df[includedFeatures]
    relevantDf['Success'] = df['Success'].values
    return relevantDf

In [185]:
def checkSalespersonsPerLead(test_df, log_reg, salespersonLoc):    
    goodLeadsSalespersons = []
    nrFailed = 0
    # TEST IF WE ALWAYS GET AT LEAST ONE GREEN
    for lead in range(len(test_df)):
        goodSalespersons = []
        for i in salespersonLoc:
            test_df[lead,i] = 0

        test_df_row = test_df[lead,:].reshape(1,-1)
           
        green = False
        for i in salespersonLoc:
            test_df_row[0][i] = 1
            prob = log_reg.predict_proba(test_df_row)[0][1]
            if prob > 0.7:
                goodSalespersons.append(i)
            test_df_row[0][i] = 0
        if len(goodSalespersons) != 0:
            goodLeadsSalespersons.append(goodSalespersons)
        else:
            nrFailed += 1
    return float(nrFailed)/len(test_df), checkDiverse(goodLeadsSalespersons, len(salespersonLoc))

In [186]:
def checkDiverse(goodLeadsSalespersons, nrSalesPersons):    
    return len(set([i[0] for i in goodLeadsSalespersons])) > nrSalesPersons/10

In [187]:
def createModel(df):
    #Choose relevant features
    features = chooseFeatures(df) 
    relevantDf = dropIrrelevantFeatures(df, features)
    
    #Make model
    clf = LogisticRegression(random_state=0, solver='lbfgs', multi_class='multinomial')
    X, y = getXY(df)
    
    #need some filter here
    percentageFailed, diverse = testModel(relevantDf, clf)

    
    return clf.fit(X, y), relevantDf

In [188]:
def testModel(df, clf):
    #split dataframe to x and y
    X, y = getXY(df)
    
    filter_col = [col for col in X if col.startswith('SP_')]
    
    salespersonLoc = []
    for col in filter_col:
        salespersonLoc.append(X.columns.get_loc(col))
        
    #split into train and test
    X_train, X_test, y_train, y_test = train_test_split(X.values, y.values, test_size = 0.2)
    log_reg = clf.fit(X_train, y_train)
    
    return checkSalespersonsPerLead(X_test, log_reg, salespersonLoc)
    

In [189]:
datafile = r"/Users/Sunna Halldorsdottir/Desktop/Cornell/Fall/Product Studio/test.xlsx"
model, dataframe = createModel(df)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  This is separate from the ipykernel package so we can avoid doing imports until
