In [31]:
import pandas as pd
import numpy as np
import joblib
import time
import collections
from sklearn import preprocessing
from sklearn.utils import shuffle
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.feature_extraction import DictVectorizer
from sklearn.neighbors import KNeighborsRegressor
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import confusion_matrix, mean_squared_error, log_loss, accuracy_score


pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)

In [32]:
pd.__version__

'0.24.2'

In [33]:
df = pd.read_csv('TenantInfo-and-usage_shuffled_inf.csv', nrows=100)

In [34]:
df.shape

(100, 228)

In [35]:
df.columns.tolist()

['TenantId',
 'CreatedDate',
 'CreateDateOfFirstSubscription',
 'CountryCode',
 'AllupSeats',
 'EXOSubscriptionsCount',
 'OD4BSubscriptionsCount',
 'SfBSubscriptionsCount',
 'TeamsSubscriptionsCount',
 'PaidCount',
 'ProjectSubscriptionsCount',
 'SPOSubscriptionsCount',
 'ActivatedSubscriptionTotalCount',
 'VisioSubscriptionsCount',
 'TrialSubscriptionsCount',
 'NonTrialSubscriptionsCount',
 'Languange',
 'DataCenterInstance',
 'DataCenterModel',
 'HasEXO',
 'HasSPO',
 'HasOD4B',
 'HasSfB',
 'HasYammer',
 'HasTeams',
 'HasTeamsFreemium',
 'HasKaizala',
 'HasProPlus',
 'HasAADP',
 'HasAIP',
 'HasAATP',
 'HasIntune',
 'HasMCAS',
 'HasWDATP',
 'HasAudioConference',
 'HasPhoneSystem',
 'HasEdiscovery',
 'HasCompliance',
 'HasThreatIntelligence',
 'HasCustomerLockbox',
 'HasOATP',
 'HasAADPP2',
 'HasAIPP2',
 'HasWindows',
 'HasO365CAS',
 'HasCASDiscovery',
 'HasPAM',
 'HasPowerBI',
 'HasPowerBIPremium',
 'HasPowerBIPro',
 'HasVisio',
 'HasProject',
 'HasNonTrial',
 'HasSubscription_AllCount

In [36]:
# df = pd.read_csv('TenantInfo-and-usage_shuffled_inf.csv'#, nrows=200000
#                 )

In [37]:
# print('full data set size {}'.format(df.shape))


# Preprocess data

1. encode categorical data
2. transfer datetime data
3. convert nan to zero, inf to one
4. encode boolean type data
5. split dataset
6. normalize data

In [38]:
cols_name = pd.Series(data=df.columns)

ar_04_beg_col_index = cols_name[cols_name == 'AR_exchange_04'].index[0]
ar_06_beg_col_index = cols_name[cols_name == 'AR_exchange_06'].index[0]
ar_06_end_col_index = cols_name[cols_name == 'AR_officeclient_06'].index[0]

wl_AR_cols = cols_name[ar_04_beg_col_index : ar_06_end_col_index+1].tolist()

In [39]:
output_cols = cols_name[ar_06_beg_col_index : ar_06_end_col_index+1].tolist()


In [40]:
len(output_cols)
# len(wl_AR_cols)

12

In [41]:
df_train = df.loc[df['Age'] >= 360]
df_test = df.loc[df['Age'] < 360]

# df_train.drop(columns='Train', inplace=True)
# df_test.drop(columns='Train', inplace=True)

In [42]:
df_train = df_train.drop('Age', axis=1)
df_test = df_test.drop('Age', axis=1)

In [43]:
print(df_train.shape)
print(df_test.shape)

(80, 227)
(20, 227)


In [44]:
ytrain = df_train.loc[:, output_cols]
ytest = df_test.loc[:, output_cols]

Xtrain = df_train.drop(columns=wl_AR_cols) # use profile only
# Xtrain = df_train.drop(columns=output_cols) # use profile + previous usage
# Xtrain = df_train.loc[:, wl_AR_cols[:-12]] # use previous usage only

Xtest = df_test.drop(columns=wl_AR_cols) # use profile only
# Xtest = df_test.drop(columns=output_cols) # use profile + previous usage
# Xtest = df_test.loc[:, wl_AR_cols[:-12]] # use previous usage only

dev_size = int(Xtrain.shape[0] * 0.2)

Xdev = Xtrain.iloc[-dev_size:,:]
ydev = ytrain.iloc[-dev_size:,:]

Xtrain = Xtrain.iloc[:-dev_size,:]
ytrain = ytrain.iloc[:-dev_size,:]


In [45]:
print('Training size is {}'.format(Xtrain.shape))
print('Dev size is {}'.format(Xdev.shape))
print('Test size is {}'.format(Xtest.shape))

Training size is (64, 191)
Dev size is (16, 191)
Test size is (20, 191)


In [46]:
ytrain = ytrain.to_numpy()
ydev = ydev.to_numpy()
ytest = ytest.to_numpy()
print('Outputs are ready!')

Outputs are ready!


In [47]:
# ytrain

In [48]:
np.save('data/ytrain_fake.npy', ytrain)
np.save('data/ydev_fake.npy', ydev)
np.save('data/ytest_fake.npy', ytest)

print('Saved the outputs targets!')

Saved the outputs targets!


In [49]:
def process_object_cols(df):
    cols_datetime = ['CreatedDate', 'CreateDateOfFirstSubscription','FirstPaidEXOStartDate',
       'FirstPaidSPOStartDate', 'FirstPaidOD4BStartDate',
       'FirstPaidSfBStartDate', #'FirstPaidYammerStartDate',
       'FirstPaidTeamsStartDate', 'FirstPaidProPlusStartDate',
       #'FirstPaidAADPStartDate', 'FirstPaidAIPStartDate',
       #'FirstPaidAATPStartDate', 'FirstPaidIntuneStartDate',
       #'FirstPaidMCASStartDate', 'FirstPaidO365E5SkuStartDate',
       #'FirstPaidM365E5SkuStartDate', 'FirstPaidEMSE5SkuStartDate'
                    ]
    df_datetime = df.loc[:, cols_datetime]
    
    cols_cat = ['CountryCode', 'Languange', #'DataCenterInstance', 'DataCenterModel',
       'SignupLocationInfo_Country', #'SignupLocationInfo_CountryCode',
       #'SignupLocationInfo_Region', 'TopParents_AreaName',
       'TopParents_CountryCode', #'TopParents_BigAreaName', 
       'TopParents_Industry', #'TopParents_RegionName',
       'TopParents_SegmentGroup', #'TopParents_SubRegionName',
       'TopParents_VerticalName']
    df_cat = df.loc[:, cols_cat]
    
    df_tenantid = df.loc[:,'TenantId']
    
    return df_tenantid, df_cat, df_datetime, cols_datetime

In [50]:
Xtrain_id, Xtrain_cat, Xtrain_datetime, _ = process_object_cols(Xtrain)
Xdev_id, Xdev_cat, Xdev_datetime, datetime_cols = process_object_cols(Xdev)
Xtest_id, Xtest_cat, Xtest_datetime, _ = process_object_cols(Xtest)

In [51]:
datetime_cols

['CreatedDate',
 'CreateDateOfFirstSubscription',
 'FirstPaidEXOStartDate',
 'FirstPaidSPOStartDate',
 'FirstPaidOD4BStartDate',
 'FirstPaidSfBStartDate',
 'FirstPaidTeamsStartDate',
 'FirstPaidProPlusStartDate']

In [52]:
Xtrain_cat.shape

(64, 7)

In [53]:
Xtrain_cat_dict

[{'CountryCode': 'AU',
  'Languange': 'en',
  'SignupLocationInfo_Country': 'Australia',
  'TopParents_CountryCode': 'AUS',
  'TopParents_Industry': 'Health Provider',
  'TopParents_SegmentGroup': 'Small, Medium & Corporate Commercial',
  'TopParents_VerticalName': 'Health Provider'},
 {'CountryCode': 'LU',
  'Languange': 'en',
  'SignupLocationInfo_Country': 'Luxembourg',
  'TopParents_CountryCode': 'LUX',
  'TopParents_Industry': '0',
  'TopParents_SegmentGroup': 'Small, Medium & Corporate Commercial',
  'TopParents_VerticalName': 'Unknown'},
 {'CountryCode': 'US',
  'Languange': 'en',
  'SignupLocationInfo_Country': 'United States',
  'TopParents_CountryCode': 'USA',
  'TopParents_Industry': '0',
  'TopParents_SegmentGroup': 'Small, Medium & Corporate Commercial',
  'TopParents_VerticalName': 'Unknown'},
 {'CountryCode': 'IT',
  'Languange': 'it',
  'SignupLocationInfo_Country': 'Italy',
  'TopParents_CountryCode': 'ITA',
  'TopParents_Industry': '0',
  'TopParents_SegmentGroup': 'S

In [54]:
Xtrain_cat_dict = Xtrain_cat.to_dict(orient='records')
Xdev_cat_dict = Xdev_cat.to_dict(orient='records')
Xtest_cat_dict = Xtest_cat.to_dict(orient='records')


In [55]:
dv = DictVectorizer(sparse=False)
Xtrain_cat_encoded = dv.fit_transform(Xtrain_cat_dict)
Xdev_cat_encoded = dv.transform(Xdev_cat_dict)
Xtest_cat_encoded = dv.transform(Xtest_cat_dict)

In [56]:
vocab = dv.vocabulary_
# vocab

In [57]:
# sorted the dictionary by value using OrderedDict
vocab_od = collections.OrderedDict(sorted(vocab.items(), key=lambda x:x[1]))
# print(od)
# OrderedDict([('Orange', 2), ('Apple', 5), ('Mango', 7), ('Banana', 95)])
cat_encoded_cols = list(vocab_od.keys())
# print(cols_cat)
# ['Orange', 'Apple', 'Mango', 'Banana']

In [58]:
len(cols_cat)

NameError: name 'cols_cat' is not defined

In [25]:
# joblib.dump(dv, 'results/vectorizer.pkl')


In [26]:
# vec = joblib.load('results/vectorizer.pkl')
# vec.vocabulary_

In [127]:
Xtrain_cat_encoded.shape

(64, 103)

In [113]:
Xtrain_cat_encoded[:20,:]

array([[0., 1., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 1.],
       [0., 0., 0., ..., 0., 0., 1.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 1., ..., 0., 0., 0.]])

In [114]:
# encode datetime columns
def encoder_datetime(df):
    cols = df.columns
    for i in cols:
        df[i] = pd.to_datetime(df[i], utc=True, errors='coerce').astype(int,errors='ignore')
    return df

In [115]:
Xtrain_datetime = encoder_datetime(Xtrain_datetime)
Xdev_datetime = encoder_datetime(Xdev_datetime)
Xtest_datetime = encoder_datetime(Xtest_datetime)

In [116]:
Xtrain_datetime = Xtrain_datetime.to_numpy()
Xdev_datetime = Xdev_datetime.to_numpy()
Xtest_datetime = Xtest_datetime.to_numpy()

In [128]:
Xtrain_datetime.shape

(64, 8)

In [129]:
X_num = Xtrain.select_dtypes(include=['float','int'])
X_bool = Xtrain.select_dtypes(include='bool')

In [130]:
X_num.shape

(64, 122)

In [131]:
X_bool.shape

(64, 40)

In [132]:
cols_name = cat_encoded_cols + X_num.columns.to_list() + X_bool.columns.to_list() + cols_datetime
# cols_name.append(X_bool.columns.to_list())
# cols_name.append(X_num.columns.to_list())

In [133]:
len(cols_name)

273

In [28]:
cols_name

0                             TenantId
1                          CreatedDate
2        CreateDateOfFirstSubscription
3                          CountryCode
4                           AllupSeats
5                EXOSubscriptionsCount
6               OD4BSubscriptionsCount
7                SfBSubscriptionsCount
8              TeamsSubscriptionsCount
9                            PaidCount
10           ProjectSubscriptionsCount
11               SPOSubscriptionsCount
12     ActivatedSubscriptionTotalCount
13             VisioSubscriptionsCount
14             TrialSubscriptionsCount
15          NonTrialSubscriptionsCount
16                           Languange
17                  DataCenterInstance
18                     DataCenterModel
19                              HasEXO
20                              HasSPO
21                             HasOD4B
22                              HasSfB
23                           HasYammer
24                            HasTeams
25                    Has

In [29]:
with open('encoded_columns.txt', 'w') as f:
    for item in cols_name:
        f.write("%s\n" % item)

In [30]:
pd.read_csv('encoded_columns.txt', delimiter="\t", header=None)

Unnamed: 0,0
0,TenantId
1,CreatedDate
2,CreateDateOfFirstSubscription
3,CountryCode
4,AllupSeats
5,EXOSubscriptionsCount
6,OD4BSubscriptionsCount
7,SfBSubscriptionsCount
8,TeamsSubscriptionsCount
9,PaidCount


In [121]:
def encoder_num_bool(df):
    X_num = df.select_dtypes(include=['float','int'])
    X_bool = df.select_dtypes(include='bool')
    
    num_cols = X_num.columns.to_list()
    bool_cols = X_bool.columns.to_list()
    
    X_bool = X_bool.astype(int).to_numpy()
    X_num = X_num.to_numpy()
    return X_bool, X_num, num_cols, bool_cols

In [122]:
Xtrain_bool, Xtrain_num, num_cols, bool_cols = encoder_num_bool(Xtrain)
Xdev_bool, Xdev_num, _, _ = encoder_num_bool(Xdev)
Xtest_bool, Xtest_num, _, _ = encoder_num_bool(Xtest) 

In [123]:
def concat_inputs(X_cat_encoded, X_num, X_bool, X_datetime):
    X = np.concatenate((X_cat_encoded, X_num, X_bool, X_datetime), axis=1)
    return X

In [124]:
Xtrain_arr = concat_inputs(Xtrain_cat_encoded, Xtrain_num, Xtrain_bool, Xtrain_datetime)
Xdev_arr = concat_inputs(Xdev_cat_encoded, Xdev_num, Xdev_bool, Xdev_datetime)
Xtest_arr = concat_inputs(Xtest_cat_encoded, Xtest_num, Xtest_bool, Xtest_datetime)

In [83]:
print('After encoding, the training size is {}'.format(Xtrain_arr.shape))
print('After encoding, the dev size is {}'.format(Xdev_arr.shape))
print('After encoding, the test size is {}'.format(Xtest_arr.shape))

After encoding, the training size is (64, 273)
After encoding, the dev size is (16, 273)
After encoding, the test size is (20, 273)


In [31]:
scaler = StandardScaler()
Xtrain_scal = scaler.fit_transform(Xtrain_arr)
Xdev_scal = scaler.transform(Xdev_arr)
Xtest_scal = scaler.transform(Xtest_arr)

In [32]:
np.save('data/Xtrain_fake.npy', Xtrain_scal)
np.save('data/Xdev_fake.npy', Xdev_scal)
np.save('data/Xtest_fake.npy', Xtest_scal)
print('Saved the encoded inputs!')

Saved the encoded inputs!


In [125]:
Xtrain_arr.shape

(64, 273)

## Below is the alternative way to encode the training, dev and test inputs.

In [83]:
def encoder_training_inputs(df_X):
    print('Starting to encode training inputs:')
    X_bool, X_num = encoder_num_bool(df_X)
    df_X_id, df_X_cat, df_X_datetime = process_object_cols(df_X)
    
    id_file_name = df_X_id.name + '_train.csv'
    df_X_id.to_csv(id_file_name, header=False)
    
    X_datetime = encoder_datetime(df_X_datetime)
    X_datetime = X_datetime.to_numpy()
    
    X_cat_dict = df_X_cat.to_dict(orient='records')
    dv = DictVectorizer(sparse=False)
    X_cat_encoded = dv.fit_transform(X_cat_dict)
    vocab = dv.vocabulary_
    
    X_arr = np.concatenate((X_cat_encoded, X_num, X_bool, X_datetime), axis=1)
    
    return X_arr, dv, vocab
  

In [81]:
def encoder_dev_test_inputs(df_X, dv, dataset_type):
    # dataset_type is a string, it should be 'dev' or 'test'.
    
    print('Starting to encode dev or test inputs:')
    X_bool, X_num = encoder_num_bool(df_X)
    df_X_id, df_X_cat, df_X_datetime = process_object_cols(df_X)
    
    id_file_name = df_X_id.name + '_' + dataset_type + '.csv'
    df_X_id.to_csv(id_file_name, header=False)
    
    X_datetime = encoder_datetime(df_X_datetime)
    X_datetime = X_datetime.to_numpy()
    
    X_cat_dict = df_X_cat.to_dict(orient='records')
    X_cat_encoded = dv.transform(X_cat_dict)
    
    X_arr = np.concatenate((X_cat_encoded, X_num, X_bool, X_datetime), axis=1)
    
    return X_arr
    
    

In [85]:
t1 = time.time()
Xtrain_arr, dv, vocab = encoder_training_inputs(Xtrain)
t2 = time.time()
print(t2-t1)

Starting the encoding training inputs:
0.16617798805236816


In [86]:
Xdev_arr = encoder_dev_test_inputs(Xdev,dv, 'dev')
Xtest_arr = encoder_dev_test_inputs(Xtest, dv, 'test')

Starting the encoding dev or test inputs:
Starting the encoding dev or test inputs:


In [88]:
scaler = StandardScaler()
Xtrain_scal = scaler.fit_transform(Xtrain_arr)
Xdev_scal = scaler.transform(Xdev_arr)
Xtest_scal = scaler.transform(Xtest_arr)

In [89]:
np.save('data/Xtrain.npy', Xtrain_scal)
np.save('data/Xdev.npy', Xdev_scal)
np.save('data/Xtest.npy', Xtest_scal)
print('Saved the encoded inputs!')

Saved the encoded inputs!
