In [1]:
import pandas as pd
import numpy as np
import warnings
warnings.filterwarnings('ignore')
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn import preprocessing
import pickle
from sklearn.linear_model import LogisticRegression
from sklearn.linear_model import SGDClassifier
import scipy
from sklearn.metrics import classification_report, f1_score
from sklearn.model_selection import train_test_split

In [2]:
customer_segment = pd.read_csv('../data/customer_segments.csv')
customer_segment.head()

Unnamed: 0,Business Partner,Customer No.,Partner Type,Data Origin,Title,Marital Status,Occupation,Date of Birth,Death date,Cash /Cashless Type,...,State,CustomerID,Recency,RecencyCluster,Frequency,FrequencyCluster,Revenue,RevenueCluster,OverallScore,Segment
0,CS000018,CS000018,Unknown,,Unknown,Unknown,Unknown,,,,...,tamil nadu,CS000018,186,2,1,0,49.75,0,2,1
1,E10007,E10007,Retail,Camp-Outdoor,Unknown,Unknown,Unknown,,,,...,telangana,E10007,1215,0,1,0,1083.77,0,0,0
2,E10013,E10013,Retail,Camp-Outdoor,Unknown,Unknown,Unknown,,,,...,telangana,E10013,1052,0,2,0,1001.29,0,0,0
3,E10013,E10013,Retail,Camp-Outdoor,Unknown,Unknown,Unknown,,,,...,telangana,E10013,1052,0,2,0,1001.29,0,0,0
4,E10014,E10014,Retail,Camp-Outdoor,Mr.,Unknown,Unknown,,,,...,telangana,E10014,1290,0,1,0,785.21,0,0,0


In [5]:
customer_segment.shape

(492314, 59)

In [6]:
customer_segment.columns

Index(['Business Partner', 'Customer No.', 'Partner Type', 'Data Origin',
       'Title', 'Marital Status', 'Occupation', 'Date of Birth', 'Death date',
       'Cash /Cashless Type', 'Claim No.', 'Cust Type', 'Expiry Date',
       'Gate Pass Date', 'Gate Pass Time', 'Insurance Company', 'Invoice Date',
       'Invoice No', 'Invoice Time', 'Job Card No', 'JobCard Date',
       'JobCard Time', 'KMs Reading', 'Labour Total', 'Make', 'Misc Total',
       'Model', 'ODN No.', 'OSL Total', 'Order Type', 'Parts Total',
       'Pin code', 'Plant', 'Plant Name1', 'Policy no.', 'Print Status',
       'Recovrbl Exp', 'Regn No', 'Service Advisor Name', 'Technician Name',
       'Total Amt Wtd Tax.', 'Total Value', 'User ID', 'Invoice Day',
       'Invoice Month', 'Invoice Year', 'Day of the week', 'Division Name',
       'District', 'State', 'CustomerID', 'Recency', 'RecencyCluster',
       'Frequency', 'FrequencyCluster', 'Revenue', 'RevenueCluster',
       'OverallScore', 'Segment'],
      dtype=

In [34]:
customer_segment['Regn No'].isnull().sum()

16

In [35]:
customer_segment = customer_segment[customer_segment['Regn No'].notna()]
customer_segment['Regn No'].isnull().sum()

0

In [50]:
customer_segment.shape

(492298, 59)

In [51]:
customer_regn = customer_segment.groupby('Customer No.')['Regn No'].nunique().reset_index()

customer_regn.columns = ['Customer No.', 'Cars owned']

customer_segment = customer_segment.merge(customer_regn, on='Customer No.')

customer_segment.shape

(492298, 60)

In [54]:
customer_segment.columns

Index(['Business Partner', 'Customer No.', 'Partner Type', 'Data Origin',
       'Title', 'Marital Status', 'Occupation', 'Date of Birth', 'Death date',
       'Cash /Cashless Type', 'Claim No.', 'Cust Type', 'Expiry Date',
       'Gate Pass Date', 'Gate Pass Time', 'Insurance Company', 'Invoice Date',
       'Invoice No', 'Invoice Time', 'Job Card No', 'JobCard Date',
       'JobCard Time', 'KMs Reading', 'Labour Total', 'Make', 'Misc Total',
       'Model', 'ODN No.', 'OSL Total', 'Order Type', 'Parts Total',
       'Pin code', 'Plant', 'Plant Name1', 'Policy no.', 'Print Status',
       'Recovrbl Exp', 'Regn No', 'Service Advisor Name', 'Technician Name',
       'Total Amt Wtd Tax.', 'Total Value', 'User ID', 'Invoice Day',
       'Invoice Month', 'Invoice Year', 'Day of the week', 'Division Name',
       'District', 'State', 'CustomerID', 'Recency', 'RecencyCluster',
       'Frequency', 'FrequencyCluster', 'Revenue', 'RevenueCluster',
       'OverallScore', 'Segment', 'Cars owned']

In [3]:
customer_segment['has_policy'] = customer_segment['Insurance Company'].notnull().astype(int)

customer_segment['has_policy'].value_counts()

0    468723
1     23591
Name: has_policy, dtype: int64

In [4]:
relevant_features = ['Data Origin', 'Cust Type', 'KMs Reading', 'Make',
       'Model','Order Type', 'Plant Name1', 'Total Amt Wtd Tax.', 
                     'District', 'State', 'has_policy', 'Segment']

In [5]:
customer_segment[customer_segment['Segment']==2]['has_policy'].value_counts()

0    5634
1      41
Name: has_policy, dtype: int64

#### Not grouping by customer ID because for every new customer, we are trying to predict the segment of that customer to be low, mid or high. As the customer is coming for the first time, the frequency and the amount they will generate will be low. Hence, for new customers, the model will always predict them to be of the lower segment. 

#### We are keeping the different services by the same customers as they are in the data set and not grouping them, so that the model considers them to be different customers entirely.

In [45]:
customer_model_data = customer_segment[relevant_features]
customer_model_data.head()

Unnamed: 0,Data Origin,Cust Type,KMs Reading,Make,Model,Order Type,Plant Name1,Total Amt Wtd Tax.,District,State,has_policy,Segment
0,,Retail,3000,HONDA,MOBILIO,Mechanical,NERKUNDRAM,49.75,kanchipuram,tamil nadu,1,1
1,Camp-Outdoor,Retail,172979,MAHINDRA & MAHINDRA,BOLERO PIK UP,Running Repairs,BALANAGAR,1083.77,hyderabad,telangana,0,0
2,Camp-Outdoor,Retail,42413,MARUTI SUZUKI,WAGONR,SMC Value Package,BALANAGAR,1001.29,hyderabad,telangana,0,0
3,Camp-Outdoor,Retail,42413,MARUTI SUZUKI,WAGONR,SMC Redemption,BALANAGAR,0.0,hyderabad,telangana,0,0
4,Camp-Outdoor,Retail,79041,MARUTI SUZUKI,WAGONR,Paid Service,MADEENAGUDA,785.21,hyderabad,telangana,0,0


In [46]:
customer_model_data['Segment'].isnull().sum()

0

In [47]:
customer_model_data['Model'].isnull().sum()

1525

In [48]:
#fill null values
customer_model_data['Data Origin'].fillna('Data Origin Unknown', inplace=True)
customer_model_data['Model'].fillna('Model Unknown', inplace=True)

In [27]:
customer_model_data.columns

Index(['Data Origin', 'Cust Type', 'KMs Reading', 'Make', 'Model',
       'Order Type', 'Plant Name1', 'Total Amt Wtd Tax.', 'District', 'State',
       'has_policy', 'Segment'],
      dtype='object')

In [2]:
customer_model_data = pd.read_csv('../data/customer_model_data.csv')

In [None]:
data_origin_enc = pd.get_dummies(customer_model_data['Data Origin'],prefix = 'data_origin_')

cust_type_enc = pd.get_dummies(customer_model_data['Cust Type'],prefix = 'cust_type_')

make_enc = pd.get_dummies(customer_model_data['Make'],prefix = 'make_')

model_enc =  pd.get_dummies(customer_model_data['Model'],prefix = 'model_')

ot_enc =  pd.get_dummies(customer_model_data['Order Type'],prefix = 'ot_')

plant_enc =  pd.get_dummies(customer_model_data['Plant Name1'],prefix = 'plant_')

state_enc =  pd.get_dummies(customer_model_data['State'],prefix = 'state_')

district_enc =  pd.get_dummies(customer_model_data['District'],prefix = 'district_').astype(np.int8)

data_encoded = pd.concat([data_origin_enc, cust_type_enc, make_enc, model_enc, ot_enc, plant_enc, state_enc, district_enc],
                         axis=1)

data_encoded = pd.concat([data_encoded, customer_model_data['KMs Reading'], customer_model_data['Total Amt Wtd Tax.'],
                         customer_model_data['has_policy'], customer_model_data['Segment']], axis=1)

In [None]:
data_encoded.shape

In [23]:
input_data = ['Camp-Outdoor', 'Retail', 3000, 'HONDA', 'MOBILIO', 'Mechanical', 'BALANAGAR', 500, 'hyderabad', 'telangana', 
              '0']
input_df = pd.DataFrame({
    'Data Origin': input_data[0],
    'Cust Type': input_data[1],
    'KMs Reading': input_data[2], 
    'Make': input_data[3], 
    'Model': input_data[4],
    'Order Type': input_data[5], 
    'Plant Name1': input_data[6], 
    'Total Amt Wtd Tax.': input_data[7], 
    'District': input_data[8], 
    'State': input_data[9],
    'has_policy': input_data[10]
}, index=[0])
categorical_cols = ['Data Origin', 'Cust Type', 'Make', 'Model', 'Order Type', 'Plant Name1', 'District', 'State']
input_data_encoded = pd.get_dummies(input_df[categorical_cols])
input_data_encoded[['KMs Reading', 'Total Amt Wtd Tax.', 'has_policy']] = input_df[['KMs Reading', 'Total Amt Wtd Tax.', 'has_policy']]
# Get missing columns in the training test
missing_cols = set( data_encoded.columns ) - set( input_data_encoded.columns )
# Add a missing column in test set with default value equal to 0
for c in missing_cols:
    input_data_encoded[c] = 0
# Ensure the order of column in the test set is in the same order than in train set
input_data_encoded = input_data_encoded[data_encoded.columns]

In [24]:
input_data_encoded

Unnamed: 0,data_origin__Camp-Outdoor,data_origin__Data Origin Unknown,cust_type__Corporate others,cust_type__Corporate- M&M,cust_type__Fleets,cust_type__Franchise,cust_type__Insurance,cust_type__MFCWL,cust_type__Retail,make__AUDI,...,district__zirakhpur,district__zirakpur,district__zirkapur,district__zirkpur,district__zirkpur mohali,district__zirkpur punjab,KMs Reading,Total Amt Wtd Tax.,has_policy,Segment
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,3000,500,0,0


In [8]:
customer_model_data.columns

Index(['Data Origin', 'Cust Type', 'KMs Reading', 'Make', 'Model',
       'Order Type', 'Plant Name1', 'Total Amt Wtd Tax.', 'District', 'State',
       'has_policy', 'Segment'],
      dtype='object')

In [11]:
#normalize numerical features
numerica_df = data_encoded[['KMs Reading', 'Total Amt Wtd Tax.']]

min_max_scaler = preprocessing.MinMaxScaler()
np_scaled = min_max_scaler.fit_transform(numerica_df)
df_normalized = pd.DataFrame(np_scaled, columns = ['KMs Reading', 'Total Amt Wtd Tax.'])
data_encoded[['KMs Reading', 'Total Amt Wtd Tax.']] = df_normalized[['KMs Reading', 'Total Amt Wtd Tax.']]
data_encoded[['KMs Reading', 'Total Amt Wtd Tax.']].head() 

Unnamed: 0,KMs Reading,Total Amt Wtd Tax.
0,0.0003,0.000288
1,0.017298,0.001883
2,0.004241,0.001756
3,0.004241,0.000211
4,0.007904,0.001422


In [12]:
X = data_encoded.loc[:, data_encoded.columns != 'Segment']
y = data_encoded['Segment']

In [13]:
X.shape

(492314, 4145)

In [50]:
data_iterator = pd.read_csv('../data/data_encoded.csv',chunksize=10000)

classifier = SGDClassifier(n_jobs=-1,verbose=1)

In [None]:
i=0
for data_set in data_iterator:
    i+=1
    X = data_set.iloc[:,:-1]
    y = data_set['Segment']
    print('Iteration: '+str(i))
    classifier.partial_fit(X,y,classes=[0,1,2])

Iteration: 1


[Parallel(n_jobs=-1)]: Using backend ThreadingBackend with 4 concurrent workers.


-- Epoch 1-- Epoch 1-- Epoch 1


Norm: 661682.68, NNZs: 198, Bias: -1217.356008, T: 10000, Avg. loss: 7043186477.904287
Total training time: 0.25 seconds.
Norm: 83245.96, NNZs: 240, Bias: 1278.481296, T: 10000, Avg. loss: 11785207931.481983Norm: 120972.08, NNZs: 239, Bias: 171.581329, T: 10000, Avg. loss: 13377103691.427748
Total training time: 0.27 seconds.

Total training time: 0.33 seconds.


[Parallel(n_jobs=-1)]: Done   3 out of   3 | elapsed:    2.2s finished


Iteration: 2


[Parallel(n_jobs=-1)]: Using backend ThreadingBackend with 4 concurrent workers.


-- Epoch 1-- Epoch 1-- Epoch 1


Norm: 14126.64, NNZs: 296, Bias: 397.702087, T: 10000, Avg. loss: 2977437579.965934Norm: 14944.32, NNZs: 287, Bias: 1833.411502, T: 10000, Avg. loss: 2744848218.222208Norm: 268249.80, NNZs: 262, Bias: -1876.682957, T: 10000, Avg. loss: 4717383559.694256
Total training time: 0.22 seconds.

Total training time: 0.19 seconds.

Total training time: 0.17 seconds.


[Parallel(n_jobs=-1)]: Done   3 out of   3 | elapsed:    0.1s finished


Iteration: 3


[Parallel(n_jobs=-1)]: Using backend ThreadingBackend with 4 concurrent workers.


-- Epoch 1-- Epoch 1-- Epoch 1


Norm: 313425.76, NNZs: 282, Bias: -2020.468188, T: 10000, Avg. loss: 418562890.933197
Total training time: 0.13 seconds.
Norm: 45241.27, NNZs: 348, Bias: 280.172197, T: 10000, Avg. loss: 984263511.791581
Total training time: 0.19 seconds.
Norm: 27696.49, NNZs: 333, Bias: 2081.327007, T: 10000, Avg. loss: 859136793.295192
Total training time: 0.20 seconds.


[Parallel(n_jobs=-1)]: Done   3 out of   3 | elapsed:    0.1s finished


Iteration: 4


[Parallel(n_jobs=-1)]: Using backend ThreadingBackend with 4 concurrent workers.


-- Epoch 1-- Epoch 1

-- Epoch 1
Norm: 236978.59, NNZs: 282, Bias: -2020.468188, T: 10000, Avg. loss: 0.000000
Total training time: 0.09 seconds.
Norm: 20731.99, NNZs: 441, Bias: 2243.120461, T: 10000, Avg. loss: 552280566.814884
Total training time: 0.19 seconds.
Norm: 10807.53, NNZs: 460, Bias: 101.564625, T: 10000, Avg. loss: 841645366.752695
Total training time: 0.20 seconds.


[Parallel(n_jobs=-1)]: Done   3 out of   3 | elapsed:    0.2s finished


Iteration: 5


[Parallel(n_jobs=-1)]: Using backend ThreadingBackend with 4 concurrent workers.


-- Epoch 1
-- Epoch 1
-- Epoch 1
Norm: 190511.29, NNZs: 282, Bias: -2020.468188, T: 10000, Avg. loss: 0.000000
Total training time: 0.11 seconds.
Norm: 11967.27, NNZs: 561, Bias: 2288.031740, T: 10000, Avg. loss: 734106108.155302
Total training time: 0.20 seconds.
Norm: 23660.74, NNZs: 571, Bias: 59.215557, T: 10000, Avg. loss: 732040607.355331
Total training time: 0.20 seconds.


[Parallel(n_jobs=-1)]: Done   3 out of   3 | elapsed:    0.5s finished


Iteration: 6


[Parallel(n_jobs=-1)]: Using backend ThreadingBackend with 4 concurrent workers.


-- Epoch 1-- Epoch 1
-- Epoch 1

Norm: 110541.68, NNZs: 327, Bias: -2142.733024, T: 10000, Avg. loss: 483869503.354111
Total training time: 0.14 seconds.
Norm: 3735.95, NNZs: 611, Bias: 2420.677601, T: 10000, Avg. loss: 668455395.478070
Total training time: 0.22 seconds.
Norm: 25913.97, NNZs: 619, Bias: 62.486270, T: 10000, Avg. loss: 645559627.779635
Total training time: 0.20 seconds.


[Parallel(n_jobs=-1)]: Done   3 out of   3 | elapsed:    0.2s finished


Iteration: 7


[Parallel(n_jobs=-1)]: Using backend ThreadingBackend with 4 concurrent workers.


-- Epoch 1-- Epoch 1

-- Epoch 1
Norm: 86772.50, NNZs: 330, Bias: -2143.552666, T: 10000, Avg. loss: 879230.364890
Total training time: 0.09 seconds.
Norm: 64230.77, NNZs: 671, Bias: 45.623089, T: 10000, Avg. loss: 395833156.888510
Total training time: 0.17 seconds.
Norm: 11075.43, NNZs: 672, Bias: 2459.349503, T: 10000, Avg. loss: 399660985.981680
Total training time: 0.20 seconds.


[Parallel(n_jobs=-1)]: Done   3 out of   3 | elapsed:    0.1s finished


Iteration: 8


[Parallel(n_jobs=-1)]: Using backend ThreadingBackend with 4 concurrent workers.


-- Epoch 1
-- Epoch 1-- Epoch 1

Norm: 76059.71, NNZs: 330, Bias: -2143.552666, T: 10000, Avg. loss: 0.000000
Total training time: 0.11 seconds.
Norm: 8483.97, NNZs: 747, Bias: -21.201247, T: 10000, Avg. loss: 518490192.066788
Total training time: 0.20 seconds.
Norm: 11451.90, NNZs: 743, Bias: 2513.891242, T: 10000, Avg. loss: 409620219.645344
Total training time: 0.22 seconds.


[Parallel(n_jobs=-1)]: Done   3 out of   3 | elapsed:    0.2s finished


Iteration: 9


[Parallel(n_jobs=-1)]: Using backend ThreadingBackend with 4 concurrent workers.


-- Epoch 1-- Epoch 1-- Epoch 1


Norm: 67701.41, NNZs: 330, Bias: -2143.552666, T: 10000, Avg. loss: 0.000000
Total training time: 0.13 seconds.
Norm: 1692.98, NNZs: 799, Bias: -54.387067, T: 10000, Avg. loss: 313811110.084696
Total training time: 0.22 seconds.
Norm: 5361.06, NNZs: 787, Bias: 2553.406376, T: 10000, Avg. loss: 347632222.188506
Total training time: 0.22 seconds.


[Parallel(n_jobs=-1)]: Done   3 out of   3 | elapsed:    0.2s finished


Iteration: 10


[Parallel(n_jobs=-1)]: Using backend ThreadingBackend with 4 concurrent workers.


-- Epoch 1-- Epoch 1-- Epoch 1


Norm: 60998.23, NNZs: 330, Bias: -2143.552666, T: 10000, Avg. loss: 0.000000
Total training time: 0.12 seconds.
Norm: 10676.60, NNZs: 859, Bias: -101.345729, T: 10000, Avg. loss: 289427261.440541
Total training time: 0.20 seconds.
Norm: 7094.02, NNZs: 846, Bias: 2582.688483, T: 10000, Avg. loss: 303777531.896570
Total training time: 0.27 seconds.


[Parallel(n_jobs=-1)]: Done   3 out of   3 | elapsed:    0.2s finished


Iteration: 11


[Parallel(n_jobs=-1)]: Using backend ThreadingBackend with 4 concurrent workers.


-- Epoch 1-- Epoch 1
-- Epoch 1

Norm: 55502.85, NNZs: 330, Bias: -2143.552666, T: 10000, Avg. loss: 0.000000
Total training time: 0.11 seconds.
Norm: 6665.77, NNZs: 917, Bias: 2618.165840, T: 10000, Avg. loss: 324379833.849815Norm: 3415.11, NNZs: 930, Bias: -135.652549, T: 10000, Avg. loss: 348546989.989762
Total training time: 0.19 seconds.

Total training time: 0.20 seconds.


[Parallel(n_jobs=-1)]: Done   3 out of   3 | elapsed:    0.3s finished


Iteration: 12


[Parallel(n_jobs=-1)]: Using backend ThreadingBackend with 4 concurrent workers.


-- Epoch 1-- Epoch 1
-- Epoch 1

Norm: 50915.80, NNZs: 330, Bias: -2143.552666, T: 10000, Avg. loss: 0.000000
Total training time: 0.09 seconds.
Norm: 4471.70, NNZs: 982, Bias: -152.131271, T: 10000, Avg. loss: 186728120.815948
Total training time: 0.20 seconds.
Norm: 1638.66, NNZs: 972, Bias: 2629.583139, T: 10000, Avg. loss: 182075184.718242
Total training time: 0.19 seconds.


[Parallel(n_jobs=-1)]: Done   3 out of   3 | elapsed:    0.1s finished


Iteration: 13


[Parallel(n_jobs=-1)]: Using backend ThreadingBackend with 4 concurrent workers.


-- Epoch 1
-- Epoch 1
-- Epoch 1
Norm: 47029.07, NNZs: 330, Bias: -2143.552666, T: 10000, Avg. loss: 0.000000
Total training time: 0.09 seconds.
Norm: 10131.44, NNZs: 1043, Bias: 2651.189046, T: 10000, Avg. loss: 287444092.017722
Total training time: 0.19 seconds.
Norm: 5092.54, NNZs: 1037, Bias: -165.094081, T: 10000, Avg. loss: 262509857.111483
Total training time: 0.19 seconds.


[Parallel(n_jobs=-1)]: Done   3 out of   3 | elapsed:    0.1s finished


Iteration: 14


[Parallel(n_jobs=-1)]: Using backend ThreadingBackend with 4 concurrent workers.


-- Epoch 1-- Epoch 1

-- Epoch 1
Norm: 43693.65, NNZs: 330, Bias: -2143.552666, T: 10000, Avg. loss: 0.000000
Total training time: 0.09 seconds.
Norm: 9748.39, NNZs: 1092, Bias: -190.887817, T: 10000, Avg. loss: 180435661.943350
Total training time: 0.19 seconds.
Norm: 7629.63, NNZs: 1088, Bias: 2673.893649, T: 10000, Avg. loss: 141025154.814834
Total training time: 0.19 seconds.


[Parallel(n_jobs=-1)]: Done   3 out of   3 | elapsed:    0.1s finished


Iteration: 15


[Parallel(n_jobs=-1)]: Using backend ThreadingBackend with 4 concurrent workers.


-- Epoch 1-- Epoch 1-- Epoch 1


Norm: 40800.01, NNZs: 330, Bias: -2143.552666, T: 10000, Avg. loss: 0.000000
Total training time: 0.11 seconds.
Norm: 5383.59, NNZs: 1139, Bias: 2683.404045, T: 10000, Avg. loss: 163663395.084022
Total training time: 0.20 seconds.
Norm: 6409.75, NNZs: 1146, Bias: -201.516842, T: 10000, Avg. loss: 145359698.639933
Total training time: 0.19 seconds.


[Parallel(n_jobs=-1)]: Done   3 out of   3 | elapsed:    0.1s finished


Iteration: 16


[Parallel(n_jobs=-1)]: Using backend ThreadingBackend with 4 concurrent workers.


-- Epoch 1-- Epoch 1
-- Epoch 1

Norm: 38265.83, NNZs: 330, Bias: -2143.552666, T: 10000, Avg. loss: 0.000000
Total training time: 0.27 seconds.
Norm: 13224.36, NNZs: 1219, Bias: -203.279476, T: 10000, Avg. loss: 465240099.183364
Total training time: 0.17 seconds.
Norm: 8001.73, NNZs: 1220, Bias: 2692.942421, T: 10000, Avg. loss: 235979625.065140
Total training time: 0.36 seconds.


[Parallel(n_jobs=-1)]: Done   3 out of   3 | elapsed:    0.3s finished


Iteration: 17


[Parallel(n_jobs=-1)]: Using backend ThreadingBackend with 4 concurrent workers.


-- Epoch 1-- Epoch 1

-- Epoch 1
Norm: 36028.05, NNZs: 330, Bias: -2143.552666, T: 10000, Avg. loss: 0.000000
Total training time: 0.11 seconds.
Norm: 4549.94, NNZs: 1282, Bias: -235.671720, T: 10000, Avg. loss: 743511741.964742
Total training time: 0.19 seconds.
Norm: 7427.12, NNZs: 1286, Bias: 2723.679821, T: 10000, Avg. loss: 479061298.448345
Total training time: 0.19 seconds.


[Parallel(n_jobs=-1)]: Done   3 out of   3 | elapsed:    0.1s finished


Iteration: 18


[Parallel(n_jobs=-1)]: Using backend ThreadingBackend with 4 concurrent workers.


-- Epoch 1-- Epoch 1-- Epoch 1


Norm: 34037.54, NNZs: 330, Bias: -2143.552666, T: 10000, Avg. loss: 0.000000
Total training time: 0.11 seconds.
Norm: 16751.38, NNZs: 1351, Bias: -250.217270, T: 10000, Avg. loss: 242396644.835720
Norm: 4920.44, NNZs: 1342, Bias: 2740.693072, T: 10000, Avg. loss: 255006393.115251Total training time: 0.19 seconds.
Total training time: 0.19 seconds.



[Parallel(n_jobs=-1)]: Done   3 out of   3 | elapsed:    0.1s finished


Iteration: 19


[Parallel(n_jobs=-1)]: Using backend ThreadingBackend with 4 concurrent workers.


-- Epoch 1-- Epoch 1
-- Epoch 1

Norm: 32255.46, NNZs: 330, Bias: -2143.552666, T: 10000, Avg. loss: 0.000000
Total training time: 0.11 seconds.
Norm: 7086.09, NNZs: 1404, Bias: -271.939179, T: 10000, Avg. loss: 219081981.946832
Total training time: 0.19 seconds.
Norm: 10110.79, NNZs: 1390, Bias: 2768.713463, T: 10000, Avg. loss: 238283006.842370
Total training time: 0.19 seconds.


[Parallel(n_jobs=-1)]: Done   3 out of   3 | elapsed:    0.1s finished


Iteration: 20


[Parallel(n_jobs=-1)]: Using backend ThreadingBackend with 4 concurrent workers.


-- Epoch 1
-- Epoch 1
-- Epoch 1
Norm: 30650.70, NNZs: 330, Bias: -2143.552666, T: 10000, Avg. loss: 0.000000
Total training time: 0.11 seconds.
Norm: 8164.38, NNZs: 1446, Bias: -282.942629, T: 10000, Avg. loss: 167678798.096626
Total training time: 0.16 seconds.
Norm: 2959.91, NNZs: 1437, Bias: 2776.322648, T: 10000, Avg. loss: 302558984.232401
Total training time: 0.17 seconds.


[Parallel(n_jobs=-1)]: Done   3 out of   3 | elapsed:    0.1s finished


Iteration: 21


[Parallel(n_jobs=-1)]: Using backend ThreadingBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done   3 out of   3 | elapsed:    0.0s finished


-- Epoch 1-- Epoch 1-- Epoch 1


Norm: 29198.06, NNZs: 330, Bias: -2143.552666, T: 10000, Avg. loss: 0.000000
Total training time: 0.11 seconds.
Norm: 4673.72, NNZs: 1456, Bias: 2779.246675, T: 10000, Avg. loss: 39271315.029194
Total training time: 0.13 seconds.
Norm: 52984.32, NNZs: 1469, Bias: -284.630098, T: 10000, Avg. loss: 36148867.160558
Total training time: 0.13 seconds.
Iteration: 22


[Parallel(n_jobs=-1)]: Using backend ThreadingBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done   3 out of   3 | elapsed:    0.0s finished


-- Epoch 1-- Epoch 1-- Epoch 1


Norm: 50586.83, NNZs: 1469, Bias: -284.630098, T: 10000, Avg. loss: 0.000000
Total training time: 0.11 seconds.Norm: 27876.87, NNZs: 330, Bias: -2143.552666, T: 10000, Avg. loss: 0.000000

Total training time: 0.13 seconds.
Norm: 4462.24, NNZs: 1456, Bias: 2779.246675, T: 10000, Avg. loss: 0.000000
Total training time: 0.11 seconds.
Iteration: 23


[Parallel(n_jobs=-1)]: Using backend ThreadingBackend with 4 concurrent workers.


-- Epoch 1-- Epoch 1-- Epoch 1


Norm: 26670.07, NNZs: 330, Bias: -2143.552666, T: 10000, Avg. loss: 0.000000
Total training time: 0.11 seconds.
Norm: 904.58, NNZs: 1510, Bias: -303.509447, T: 10000, Avg. loss: 146186765.761358
Total training time: 0.17 seconds.
Norm: 2576.63, NNZs: 1498, Bias: 2791.064198, T: 10000, Avg. loss: 120048803.232884
Total training time: 0.17 seconds.


[Parallel(n_jobs=-1)]: Done   3 out of   3 | elapsed:    0.1s finished


Iteration: 24


[Parallel(n_jobs=-1)]: Using backend ThreadingBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done   3 out of   3 | elapsed:    0.1s finished


-- Epoch 1-- Epoch 1
-- Epoch 1

Norm: 25563.43, NNZs: 330, Bias: -2143.552666, T: 10000, Avg. loss: 0.000000
Total training time: 0.12 seconds.
Norm: 2744.25, NNZs: 1511, Bias: 2793.701978, T: 10000, Avg. loss: 33750799.697487
Total training time: 0.14 seconds.
Norm: 9423.28, NNZs: 1523, Bias: -306.778208, T: 10000, Avg. loss: 35078823.634943
Total training time: 0.14 seconds.
Iteration: 25


[Parallel(n_jobs=-1)]: Using backend ThreadingBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done   3 out of   3 | elapsed:    0.0s finished


-- Epoch 1-- Epoch 1

-- Epoch 1
Norm: 24544.96, NNZs: 330, Bias: -2143.552666, T: 10000, Avg. loss: 0.000000
Total training time: 0.11 seconds.
Norm: 3773.36, NNZs: 1521, Bias: 2796.051331, T: 10000, Avg. loss: 35113505.474793
Total training time: 0.14 seconds.
Norm: 5672.49, NNZs: 1531, Bias: -309.288097, T: 10000, Avg. loss: 35536021.278485
Total training time: 0.14 seconds.
Iteration: 26


[Parallel(n_jobs=-1)]: Using backend ThreadingBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done   3 out of   3 | elapsed:    0.1s finished


-- Epoch 1-- Epoch 1-- Epoch 1


Norm: 23604.54, NNZs: 330, Bias: -2143.552666, T: 10000, Avg. loss: 0.000000
Total training time: 0.11 seconds.
Norm: 4335.96, NNZs: 1537, Bias: 2801.736015, T: 10000, Avg. loss: 29417099.662934
Total training time: 0.14 seconds.
Norm: 6457.25, NNZs: 1549, Bias: -315.049009, T: 10000, Avg. loss: 30662235.828330
Total training time: 0.14 seconds.
Iteration: 27


[Parallel(n_jobs=-1)]: Using backend ThreadingBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done   3 out of   3 | elapsed:    0.1s finished


-- Epoch 1-- Epoch 1-- Epoch 1


Norm: 22733.52, NNZs: 330, Bias: -2143.552666, T: 10000, Avg. loss: 0.000000
Total training time: 0.11 seconds.
Norm: 15922.59, NNZs: 1552, Bias: 2805.005150, T: 10000, Avg. loss: 23419812.503054
Norm: 3138.92, NNZs: 1566, Bias: -319.688496, T: 10000, Avg. loss: 24984750.568162Total training time: 0.13 seconds.
Total training time: 0.14 seconds.

Iteration: 28


[Parallel(n_jobs=-1)]: Using backend ThreadingBackend with 4 concurrent workers.


-- Epoch 1-- Epoch 1
-- Epoch 1

Norm: 21924.49, NNZs: 330, Bias: -2143.552666, T: 10000, Avg. loss: 0.000000
Total training time: 0.16 seconds.
Norm: 2537.87, NNZs: 1575, Bias: 2814.894357, T: 10000, Avg. loss: 35008580.393891
Total training time: 0.19 seconds.
Norm: 5056.36, NNZs: 1585, Bias: -328.414106, T: 10000, Avg. loss: 34480919.484609
Total training time: 0.19 seconds.


[Parallel(n_jobs=-1)]: Done   3 out of   3 | elapsed:    0.1s finished


Iteration: 29


[Parallel(n_jobs=-1)]: Using backend ThreadingBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done   3 out of   3 | elapsed:    0.1s finished


-- Epoch 1-- Epoch 1
-- Epoch 1

Norm: 21171.07, NNZs: 330, Bias: -2143.552666, T: 10000, Avg. loss: 0.000000
Total training time: 0.11 seconds.
Norm: 3284.45, NNZs: 1612, Bias: -340.271216, T: 10000, Avg. loss: 53634976.247688
Total training time: 0.14 seconds.
Norm: 4534.95, NNZs: 1604, Bias: 2826.664284, T: 10000, Avg. loss: 55292770.101339
Total training time: 0.16 seconds.
Iteration: 30


[Parallel(n_jobs=-1)]: Using backend ThreadingBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done   3 out of   3 | elapsed:    0.1s finished


-- Epoch 1-- Epoch 1

-- Epoch 1
Norm: 20467.71, NNZs: 330, Bias: -2143.552666, T: 10000, Avg. loss: 0.000000
Total training time: 0.12 seconds.
Norm: 1292.87, NNZs: 1634, Bias: -345.599425, T: 10000, Avg. loss: 33038909.235149
Total training time: 0.16 seconds.
Norm: 2888.12, NNZs: 1624, Bias: 2830.578972, T: 10000, Avg. loss: 41061403.462375
Total training time: 0.16 seconds.
Iteration: 31


[Parallel(n_jobs=-1)]: Using backend ThreadingBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done   3 out of   3 | elapsed:    0.0s finished


-- Epoch 1-- Epoch 1-- Epoch 1


Norm: 19809.58, NNZs: 330, Bias: -2143.552666, T: 10000, Avg. loss: 0.000000
Total training time: 0.11 seconds.
Norm: 15485.35, NNZs: 1644, Bias: -353.751131, T: 10000, Avg. loss: 126234801.994490Norm: 9335.26, NNZs: 1626, Bias: 2836.388109, T: 10000, Avg. loss: 34378413.703510
Total training time: 0.14 seconds.

Total training time: 0.14 seconds.
Iteration: 32


[Parallel(n_jobs=-1)]: Using backend ThreadingBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done   3 out of   3 | elapsed:    0.1s finished


-- Epoch 1
-- Epoch 1
-- Epoch 1
Norm: 19192.46, NNZs: 330, Bias: -2143.552666, T: 10000, Avg. loss: 0.000000
Total training time: 0.11 seconds.
Norm: 1956.47, NNZs: 1635, Bias: 2835.882601, T: 10000, Avg. loss: 17259868.667158
Total training time: 0.11 seconds.
Norm: 1232.49, NNZs: 1650, Bias: -354.368733, T: 10000, Avg. loss: 13320502.096312
Total training time: 0.13 seconds.


In [26]:
classifier.predict(input_data_encoded.iloc[:,:-1])

array([1])

In [46]:
def perform_cv(model, X, y):
    f1_scores = []
    X_t, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33)
#     model.fit(X_t, y_train)
    model_pred = model.predict(X_t)
    print(classification_report(y_train, model_pred))
    f1 = f1_score(y_train, model_pred,average="weighted")
    print("F1 score: "+str(f1))
    f1_scores.append(f1)
    return f1_scores

In [47]:
data_iterator = pd.read_csv('../data/data_encoded.csv',chunksize=100000)

In [49]:
i=0
for data_set in data_iterator:
    i+=1
    X = data_set.iloc[:,:-1]
    y = data_set['Segment']
    print('Iteration: '+str(i))
    perform_cv(classifier, X, y)

KeyboardInterrupt: 