## Import ALS Model code

In [171]:
import als_model as am

In [2]:
%matplotlib inline

In [3]:
import cPickle as pickle

## Open cleaned_df

In [4]:
# Load the cleaned dataset pickled by the 24OCT17 pipeline run.
# NOTE(review): pickle.load can execute arbitrary code; only load pickles produced by this project.
with open('24OCT17_pipelinerun/cleaned_df.pkl', 'rb') as cleaned_file:
    cleaned_data = pickle.load(cleaned_file)

## Instantiate Class Instances for ALS Model, Prep and Run for the Following:

#### For each of these features:
- SeriesID
- EventTypeID
- Venue (based on zip code)
- Total Average Fee
- Average Mileage

#### Run the best-fit model identified in the Pipeline_v2-noTVS notebook on the larger train+validate dataset, then check the rank metric on the test set

#### Initial Prep:

In [5]:
# Wrap the cleaned data in the project's implicit-ALS helper object.
als_model = am.implicit_als(cleaned_data, split_val=False)
# Setting split_val to False results in the split being train/test instead of train/val/test:
# 80% of the data is in train and 20% is in test.
# NOTE(review): the split cell further down still prints separate Train and Validation sizes —
# confirm how split_val=False is handled inside als_model.

In [6]:
# Populate als_model.spark_full_df (previewed in the next cell).
als_model.prep_spark_full_df()

In [7]:
# Preview the first 5 rows to sanity-check the columns that the ALS runs below key on.
als_model.spark_full_df.show(5)

+--------+-------+------------+-------------------+--------+-----------+-------------+----------+---------+
|PersonID|EventID|Participated|         Event_Date|SeriesID|EventTypeID|Total_Fee_Avg|Miles2_Avg|Venue_Zip|
+--------+-------+------------+-------------------+--------+-----------+-------------+----------+---------+
|       1|     11|           1|1423958400000000000|     0.0|          1|           46|        12|    98239|
|       2|     11|           0|1423958400000000000|     0.0|          1|           46|        12|    98239|
|       3|     11|           0|1423958400000000000|     0.0|          1|           46|        12|    98239|
|       4|     11|           0|1423958400000000000|     0.0|          1|           46|        12|    98239|
|       5|     11|           0|1423958400000000000|     0.0|          1|           46|        12|    98239|
+--------+-------+------------+-------------------+--------+-----------+-------------+----------+---------+
only showing top 5 rows



In [9]:
# Split the data; the call prints the TrainVal / Train / Validation / Test sizes.
# NOTE(review): later cells in this notebook count a different number of Participated == 1 rows
# each time als_model.trainval is converted to pandas (13462, 13425, 13498), so the split may not be
# deterministic across Spark re-evaluations — confirm seeding/caching inside als_model.
als_model.train_val_test_split()

TrainVal Size: 740563.0
Train Size: 592450.0
Validation Size: 148113.0
Test Size: 185141.0


## SeriesID

In [11]:
# Print how many participants and distinct SeriesID values appear in each split
# and in the train/validate and train/test overlaps.
als_model.print_train_val_test_info("SeriesID")

participants in train: 10764
participants in validate: 10764
participants in test: 10764


participants in both train & validate: 10764
participants in both train & test: 10764


SeriesID in train: 5
SeriesID in validate: 4
SeriesID in test: 5


SeriesID in both train & validate: 4
SeriesID in both train & test: 5


In [13]:
# Build the participation matrices keyed on SeriesID.
# Presumably these are the inputs consumed by fit_ALS below — confirm in als_model.
als_model.create_participate_matrices("SeriesID")

In [16]:
# Fit the implicit ALS model with SeriesID as the item column.
# The fitted model is read back from als_model.base_model in the cells that follow.
als_model.fit_ALS(itemCol="SeriesID", rank=5, alpha=80, coldStartStrategy="drop")

In [28]:
# Score the fitted SeriesID model on the trainval and test splits. For each split the call returns
# the predictions, the model rank and the popularity-baseline rank (also printed below).
trainval_predictions, trainval_rank, trainval_pop_rank, test_predictions, test_rank, test_pop_rank = als_model.predict_ALS(
    als_model.base_model, "SeriesID")

Trainval predictions includes 740563 valid values and 0 nan values
Test predictions includes 185141 valid values and 0 nan values
Trainval Model Rank = 4.39662539817 and Popular Rank = 26.3401985479
Test Model Rank = 7.40110567146 and Popular Rank = 10.7472903594


In [30]:
# Keep handles to the SeriesID model and predictions: the generic names
# (trainval_predictions, test_predictions) are reassigned by the next feature's predict cell.
als_fit_model_series = als_model.base_model
als_valpreds_pd_series = trainval_predictions
als_testpreds_pd_series = test_predictions

als_fit_model_series.save("26OCT17_finalfitmodels/als_fit_model_series")

# The trainval predictions are written as "als_trainvalpreds_pd_series.pkl" because that is the
# path the ensemble section loads, and it matches the zipcode/avgfee/avgmile file names.
# Pickles are opened in binary mode so the writer agrees with readers that use 'rb'.
with open("26OCT17_finalfitmodels/als_trainvalpreds_pd_series.pkl", 'wb') as f:
    pickle.dump(als_valpreds_pd_series, f)
with open('26OCT17_finalfitmodels/als_testpreds_pd_series.pkl', 'wb') as f:
    pickle.dump(als_testpreds_pd_series, f)

In [31]:
#these only need to be saved once today:
als_trainval_df_pd = als_model.trainval.toPandas()
als_test_df_pd = als_model.test.toPandas()

In [32]:
# Persist the pandas copies of the trainval and test splits for the ensemble step.
# Binary mode ('wb') is used so the files can be read back with 'rb' on any platform.
with open("26OCT17_finalfitmodels/als_trainval_df_pd.pkl", 'wb') as f:
    pickle.dump(als_trainval_df_pd, f)

with open("26OCT17_finalfitmodels/als_test_df_pd.pkl", 'wb') as f:
    pickle.dump(als_test_df_pd, f)

## EventTypeID

In [33]:
# Print how many participants and distinct EventTypeID values appear in each split
# and in the train/validate and train/test overlaps.
als_model.print_train_val_test_info("EventTypeID")

participants in train: 10764
participants in validate: 10764
participants in test: 10764


participants in both train & validate: 10764
participants in both train & test: 10764


EventTypeID in train: 2
EventTypeID in validate: 1
EventTypeID in test: 2


EventTypeID in both train & validate: 1
EventTypeID in both train & test: 2


In [34]:
# Rebuild the participation matrices, this time keyed on EventTypeID.
als_model.create_participate_matrices("EventTypeID")

In [35]:
# Fit the implicit ALS model with EventTypeID as the item column.
# NOTE(review): rank=30 is used although the info output above reports only 2 distinct
# EventTypeID values in train — confirm that this rank is intended.
als_model.fit_ALS(itemCol="EventTypeID", rank=30, alpha=80, coldStartStrategy="drop")

In [36]:
# Score the fitted EventTypeID model on trainval and test.
# This reassigns the generic prediction/rank variables used for the previous feature.
trainval_predictions, trainval_rank, trainval_pop_rank, test_predictions, test_rank, test_pop_rank = als_model.predict_ALS(
    als_model.base_model, "EventTypeID")

Trainval predictions includes 740563 valid values and 0 nan values
Test predictions includes 185141 valid values and 0 nan values
Trainval Model Rank = 0.00683938201483 and Popular Rank = 7.97525559342
Test Model Rank = 0.417176793321 and Popular Rank = 1.91100969766


In [37]:
# Keep handles to the EventTypeID model and predictions before the next feature overwrites
# the generic prediction variables.
als_fit_model_eventtype = als_model.base_model
als_valpreds_pd_eventtype = trainval_predictions
als_testpreds_pd_eventtype = test_predictions

als_fit_model_eventtype.save("26OCT17_finalfitmodels/als_fit_model_eventtype")

# The trainval predictions are written as "als_trainvalpreds_pd_eventtype.pkl" because that is the
# path the ensemble section loads, and it matches the zipcode/avgfee/avgmile file names.
# Pickles are opened in binary mode so the writer agrees with readers that use 'rb'.
with open("26OCT17_finalfitmodels/als_trainvalpreds_pd_eventtype.pkl", 'wb') as f:
    pickle.dump(als_valpreds_pd_eventtype, f)
with open("26OCT17_finalfitmodels/als_testpreds_pd_eventtype.pkl", 'wb') as f:
    pickle.dump(als_testpreds_pd_eventtype, f)

## Venue (Based on Zipcode)

In [38]:
# Print how many participants and distinct Venue_Zip values appear in each split
# and in the train/validate and train/test overlaps.
als_model.print_train_val_test_info("Venue_Zip")

participants in train: 10764
participants in validate: 10764
participants in test: 10764


participants in both train & validate: 10764
participants in both train & test: 10764


Venue_Zip in train: 24
Venue_Zip in validate: 10
Venue_Zip in test: 15


Venue_Zip in both train & validate: 8
Venue_Zip in both train & test: 15


In [39]:
# Rebuild the participation matrices, this time keyed on Venue_Zip.
als_model.create_participate_matrices("Venue_Zip")

In [40]:
# Fit the implicit ALS model with Venue_Zip as the item column.
als_model.fit_ALS(itemCol="Venue_Zip", rank=5, alpha=80, coldStartStrategy="drop")

In [41]:
# Score the fitted Venue_Zip model on trainval and test.
# This reassigns the generic prediction/rank variables used for the previous feature.
trainval_predictions, trainval_rank, trainval_pop_rank, test_predictions, test_rank, test_pop_rank = als_model.predict_ALS(
    als_model.base_model, "Venue_Zip")

Trainval predictions includes 740563 valid values and 0 nan values
Test predictions includes 185141 valid values and 0 nan values
Trainval Model Rank = 6.60417698625 and Popular Rank = 23.1903301914
Test Model Rank = 13.0975333916 and Popular Rank = 31.7683970337


In [42]:
# Keep handles to the Venue_Zip model and predictions before the next feature overwrites
# the generic prediction variables.
als_fit_model_zipcode = als_model.base_model
als_trainvalpreds_pd_zipcode = trainval_predictions
als_testpreds_pd_zipcode = test_predictions

als_fit_model_zipcode.save("26OCT17_finalfitmodels/als_fit_model_zipcode")

# Pickles are opened in binary mode so the writer agrees with readers that use 'rb'.
with open("26OCT17_finalfitmodels/als_trainvalpreds_pd_zipcode.pkl", 'wb') as f:
    pickle.dump(als_trainvalpreds_pd_zipcode, f)
with open("26OCT17_finalfitmodels/als_testpreds_pd_zipcode.pkl", 'wb') as f:
    pickle.dump(als_testpreds_pd_zipcode, f)

## Total Average Fee

In [43]:
# Print how many participants and distinct Total_Fee_Avg values appear in each split
# and in the train/validate and train/test overlaps.
als_model.print_train_val_test_info("Total_Fee_Avg")

participants in train: 10764
participants in validate: 10764
participants in test: 10764


participants in both train & validate: 10764
participants in both train & test: 10764


Total_Fee_Avg in train: 20
Total_Fee_Avg in validate: 11
Total_Fee_Avg in test: 12


Total_Fee_Avg in both train & validate: 8
Total_Fee_Avg in both train & test: 8


In [44]:
# Rebuild the participation matrices, this time keyed on Total_Fee_Avg.
als_model.create_participate_matrices("Total_Fee_Avg")

In [45]:
# Fit the implicit ALS model with Total_Fee_Avg as the item column.
als_model.fit_ALS(itemCol="Total_Fee_Avg", rank=5, alpha=80, coldStartStrategy="drop")

In [46]:
# Score the fitted Total_Fee_Avg model on trainval and test.
# NOTE(review): the printed test prediction count (152849) is below the test size (185141).
# Presumably rows whose Total_Fee_Avg value was never seen in training are removed by
# coldStartStrategy="drop" (only 8 of the 12 test values also occur in train) — confirm.
trainval_predictions, trainval_rank, trainval_pop_rank, test_predictions, test_rank, test_pop_rank = als_model.predict_ALS(
    als_model.base_model, "Total_Fee_Avg")

Trainval predictions includes 740563 valid values and 0 nan values
Test predictions includes 152849 valid values and 0 nan values
Trainval Model Rank = 5.91329208347 and Popular Rank = 26.4941021858
Test Model Rank = 14.7866753274 and Popular Rank = 24.3897743898


In [47]:
# Keep handles to the Total_Fee_Avg model and predictions before the next feature overwrites
# the generic prediction variables.
als_fit_model_avgfee = als_model.base_model
als_trainvalpreds_pd_avgfee = trainval_predictions
als_testpreds_pd_avgfee = test_predictions

als_fit_model_avgfee.save("26OCT17_finalfitmodels/als_fit_model_avgfee")

# Pickles are opened in binary mode so the writer agrees with readers that use 'rb'.
with open("26OCT17_finalfitmodels/als_trainvalpreds_pd_avgfee.pkl", 'wb') as f:
    pickle.dump(als_trainvalpreds_pd_avgfee, f)

with open("26OCT17_finalfitmodels/als_testpreds_pd_avgfee.pkl", 'wb') as f:
    pickle.dump(als_testpreds_pd_avgfee, f)

## Average Mileage

In [48]:
# Print how many participants and distinct Miles2_Avg values appear in each split
# and in the train/validate and train/test overlaps.
als_model.print_train_val_test_info("Miles2_Avg")

participants in train: 10764
participants in validate: 10764
participants in test: 10764


participants in both train & validate: 10764
participants in both train & test: 10764


Miles2_Avg in train: 11
Miles2_Avg in validate: 5
Miles2_Avg in test: 10


Miles2_Avg in both train & validate: 5
Miles2_Avg in both train & test: 7


In [49]:
# Rebuild the participation matrices, this time keyed on Miles2_Avg.
als_model.create_participate_matrices("Miles2_Avg")

In [50]:
# Fit the implicit ALS model with Miles2_Avg as the item column.
als_model.fit_ALS(itemCol="Miles2_Avg", rank=5, alpha=80, coldStartStrategy="drop")

In [51]:
# Score the fitted Miles2_Avg model on trainval and test.
# NOTE(review): the printed test prediction count (142085) is below the test size (185141).
# Presumably rows whose Miles2_Avg value was never seen in training are removed by
# coldStartStrategy="drop" (only 7 of the 10 test values also occur in train) — confirm.
trainval_predictions, trainval_rank, trainval_pop_rank, test_predictions, test_rank, test_pop_rank = als_model.predict_ALS(
    als_model.base_model, "Miles2_Avg")

Trainval predictions includes 740563 valid values and 0 nan values
Test predictions includes 142085 valid values and 0 nan values
Trainval Model Rank = 2.29642206311 and Popular Rank = 23.1024124786
Test Model Rank = 9.91724468698 and Popular Rank = 31.4678216147


In [52]:
# Keep handles to the Miles2_Avg model and its predictions under feature-specific names.
als_fit_model_avgmile = als_model.base_model
als_trainvalpreds_pd_avgmile = trainval_predictions
als_testpreds_pd_avgmile = test_predictions

als_fit_model_avgmile.save("26OCT17_finalfitmodels/als_fit_model_avgmile")

# Pickles are opened in binary mode so the writer agrees with readers that use 'rb'.
with open("26OCT17_finalfitmodels/als_trainvalpreds_pd_avgmile.pkl", 'wb') as f:
    pickle.dump(als_trainvalpreds_pd_avgmile, f)
with open("26OCT17_finalfitmodels/als_testpreds_pd_avgmile.pkl", 'wb') as f:
    pickle.dump(als_testpreds_pd_avgmile, f)

## Build Gradient Boost Ensemble Using ALS Models Predictions as Inputs to Determine Weights for Each

In [54]:
from ensemble_helper_functions import load_files, merge_data

#### Build X, y Train Matrices for GB Model

In [55]:
# Load the saved trainval frame and the five per-feature trainval prediction frames.
# NOTE(review): load_files comes from ensemble_helper_functions and is not shown here;
# presumably it unpickles the given path — confirm.
data_df = load_files('26OCT17_finalfitmodels/als_trainval_df_pd.pkl')
fee_preds = load_files('26OCT17_finalfitmodels/als_trainvalpreds_pd_avgfee.pkl')
mile_preds = load_files('26OCT17_finalfitmodels/als_trainvalpreds_pd_avgmile.pkl')
type_preds = load_files('26OCT17_finalfitmodels/als_trainvalpreds_pd_eventtype.pkl')
series_preds = load_files('26OCT17_finalfitmodels/als_trainvalpreds_pd_series.pkl')
zipcode_preds = load_files('26OCT17_finalfitmodels/als_trainvalpreds_pd_zipcode.pkl')

In [72]:
# In notebook order this is the first cell that uses `pd`, so import pandas here;
# otherwise a top-to-bottom run fails with a NameError on this cell.
import pandas as pd

# Debug check: left-join the SeriesID predictions onto the trainval rows to see
# how many rows actually receive a prediction (inspected with .info() in the next cell).
gb_data_TEST2 = pd.merge(data_df, series_preds,
                            how='left', on=['PersonID', 'Participated', 'Event_Date', 'SeriesID'])
#gb_data_TEST2['series_prediction'] = gb_data_TEST2['prediction']
#gb_data_TEST2.drop(['prediction', 'SeriesID'], axis = 1, inplace=True)

In [73]:
# Check the non-null count of `prediction` after the left join
# (the recorded output shows 738536 of 740563 rows matched).
gb_data_TEST2.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 740563 entries, 0 to 740562
Data columns (total 10 columns):
PersonID         740563 non-null int64
EventID          740563 non-null int64
Participated     740563 non-null int64
Event_Date       740563 non-null int64
SeriesID         740563 non-null float64
EventTypeID      740563 non-null int64
Total_Fee_Avg    740563 non-null int64
Miles2_Avg       740563 non-null int64
Venue_Zip        740563 non-null int64
prediction       738536 non-null float64
dtypes: float64(2), int64(8)
memory usage: 62.2 MB


In [107]:
# Count the positive (Participated == 1) rows in the pandas trainval frame that was pickled earlier.
als_trainval_df_pd.loc[als_trainval_df_pd['Participated'] == 1].count()

PersonID         13462
EventID          13462
Participated     13462
Event_Date       13462
SeriesID         13462
EventTypeID      13462
Total_Fee_Avg    13462
Miles2_Avg       13462
Venue_Zip        13462
dtype: int64

In [172]:
# Persist the Spark trainval frame before converting it again. Presumably this is meant to stop
# repeated conversions from re-evaluating the split — the counts in this section differ between runs.
trainval = als_model.trainval.persist()

In [173]:
# Materialise the persisted Spark trainval frame as pandas for the comparisons below.
trainval_pd = trainval.toPandas()

In [174]:
# Count the positive (Participated == 1) rows in the freshly converted trainval frame.
trainval_pd.loc[trainval_pd['Participated'] == 1].count()

PersonID         13425
EventID          13425
Participated     13425
Event_Date       13425
SeriesID         13425
EventTypeID      13425
Total_Fee_Avg    13425
Miles2_Avg       13425
Venue_Zip        13425
dtype: int64

In [176]:
# Class balance of the target in trainval_pd (0 = did not participate, 1 = participated).
trainval_pd['Participated'].value_counts()

0    727138
1     13425
Name: Participated, dtype: int64

In [175]:
# Convert only two columns of the Spark trainval frame and count the positive rows again,
# to compare against the count obtained from the full conversion above.
mat_test = trainval.select('Participated', 'EventTypeID')
mat_test_pd = mat_test.toPandas()
mat_test_pd.loc[mat_test_pd['Participated'] == 1].count()

Participated    13498
EventTypeID     13498
dtype: int64

In [177]:
# Class balance of the target in the two-column conversion, for comparison with trainval_pd.
mat_test_pd['Participated'].value_counts()

0    727065
1     13498
Name: Participated, dtype: int64

In [170]:
# Both conversions should contain the same number of rows even though their positive counts differ.
len(trainval_pd), len(mat_test_pd)

(740563, 740563)

In [136]:
# Convert the de-duplicated rows of mat_test to pandas.
testdf2 = mat_test.distinct().toPandas()

In [137]:
# Count the positive rows among the distinct mat_test rows.
# NOTE(review): the recorded output lists PersonID/SeriesID/Event_Date columns, which the mat_test
# defined above (Participated, EventTypeID) does not contain — the output is from an earlier
# definition of mat_test (execution counts are out of order).
testdf2.loc[testdf2['Participated'] == 1].count()

PersonID        13498
SeriesID        13498
Participated    13498
Event_Date      13498
dtype: int64

In [125]:
# Column subset of the pandas trainval frame (same rows, four columns).
testdf = trainval_pd[['PersonID', 'SeriesID', 'Participated', 'Event_Date']]

In [126]:
# Count the positive rows in the pandas column subset.
testdf.loc[testdf['Participated'] == 1].count()

PersonID        13425
SeriesID        13425
Participated    13425
Event_Date      13425
dtype: int64

In [140]:
# Pair up, row by row, the Participated values of trainval_pd and of its column subset testdf.
zipped_list = zip(trainval_pd['Participated'].values, testdf['Participated'].values)

In [141]:
# Print every row position where the two Participated columns disagree.
# The formatted print() call emits the same text as the Python 2 `print index, item`
# statement and also runs on Python 3.
for index, item in enumerate(zipped_list):
    if item[0] != item[1]:
        print("%s %s" % (index, item))

In [57]:
import pandas as pd

# Each entry pairs a per-feature trainval prediction frame with the item column it was fit on.
predictions = [
    (series_preds, 'SeriesID'),
    (type_preds, 'EventTypeID'),
    (zipcode_preds, 'Venue_Zip'),
    (fee_preds, 'Total_Fee_Avg'),
    (mile_preds, 'Miles2_Avg'),
]

# Start from a copy of the trainval frame and fold in one prediction frame at a time
# using the project's merge_data helper.
gb_train_data = data_df.copy()
for preds_frame, item_col in predictions:
    gb_train_data = merge_data(gb_train_data, preds_frame, item_col)

In [64]:
# Print the row count of each prediction frame (one number per line).
# print(...) with a single argument behaves the same on Python 2 and Python 3.
for item in predictions:
    print(len(item[0]))

740563
740563
740563
740563
740563


In [65]:
# The merged frame should keep the same number of rows as the inputs.
len(gb_train_data)

740563

In [58]:
# Preview the merged frame: identifiers, target, and one "_prediction" column per feature.
gb_train_data.head()

Unnamed: 0,PersonID,EventID,Participated,Event_Date,SeriesID_prediction,EventTypeID_prediction,Venue_Zip_prediction,Total_Fee_Avg_prediction,Miles2_Avg_prediction
0,220,1000,0,1418428800000000000,0.068465,0.063756,0.0,0.161009,0.933387
1,1786,1000,0,1418428800000000000,0.152204,0.036247,0.0,0.664248,0.283761
2,3604,1000,0,1418428800000000000,0.080292,0.036247,0.0,0.07085,0.553726
3,7389,1000,0,1418428800000000000,0.080292,0.036247,0.0,0.28226,0.333645
4,441,1000,0,1418428800000000000,0.068284,0.036247,0.0,0.07045,0.840519


In [59]:
# Order rows chronologically by Event_Date. Rebinding the result (instead of inplace=True)
# avoids mutating a frame that earlier cells have already displayed.
gb_train_data = gb_train_data.sort_values(by='Event_Date', axis=0, ascending=True)

In [61]:
# Replace the index with a fresh 0..n-1 range, discarding the old index.
# The result is rebound rather than mutating the frame in place.
gb_train_data = gb_train_data.reset_index(drop=True)

In [63]:
# Check dtypes and non-null counts; the recorded output shows each prediction column
# has 738536 non-null values out of 740563 rows.
gb_train_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 740563 entries, 0 to 740562
Data columns (total 9 columns):
PersonID                    740563 non-null int64
EventID                     740563 non-null int64
Participated                740563 non-null int64
Event_Date                  740563 non-null int64
SeriesID_prediction         738536 non-null float64
EventTypeID_prediction      738536 non-null float64
Venue_Zip_prediction        738536 non-null float64
Total_Fee_Avg_prediction    738536 non-null float64
Miles2_Avg_prediction       738536 non-null float64
dtypes: float64(5), int64(4)
memory usage: 50.9 MB


In [85]:
#need to fill nan values for venue_zip_prediction and fee_prediction
gb_data['venuezip_prediction'] = gb_data['venuezip_prediction'].apply(
                            lambda x: gb_data['venuezip_prediction'].mean() if pd.isnull(x) else x)
#need to fill nan values for venue_zip_prediction and fee_prediction
gb_data['fee_prediction'] = gb_data['fee_prediction'].apply(
                            lambda x: gb_data['fee_prediction'].mean() if pd.isnull(x) else x)

In [86]:
# Confirm that no prediction column has missing values after the fill.
gb_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 148113 entries, 0 to 148112
Data columns (total 9 columns):
PersonID                148113 non-null int64
EventID                 148113 non-null int64
Participated            148113 non-null int64
Event_Date              148113 non-null int64
series_prediction       148113 non-null float64
eventtype_prediction    148113 non-null float64
venuezip_prediction     148113 non-null float64
fee_prediction          148113 non-null float64
miles_prediction        148113 non-null float64
dtypes: float64(5), int64(4)
memory usage: 10.2 MB


In [87]:
# Row counts for an 80/20 split of gb_data; the cells below round the train size up to 118491.
len(gb_data)*.8, len(gb_data)*.2

(118490.40000000001, 29622.600000000002)

Split the data, sorted by Event_Date (stored as an integer timestamp), into the first 118491 records for train and the last 29622 records for test.

In [88]:
# Training features: the five per-feature ALS prediction columns for the first 118491 rows
# (the 80% train portion computed above).
X_gb_train = gb_data.iloc[:118491][[
    'series_prediction',
    'eventtype_prediction',
    'venuezip_prediction',
    'fee_prediction',
    'miles_prediction',
]].copy()

In [95]:
# Training target: Participated for the same first 118491 rows.
y_gb_train = gb_data.iloc[:118491]['Participated'].copy()

In [96]:
# Features and target must have the same number of rows.
len(X_gb_train), len(y_gb_train)

(118491, 118491)

In [97]:
# Test features: the same five prediction columns for the rows from position 118491 onward.
X_gb_test = gb_data.iloc[118491:][[
    'series_prediction',
    'eventtype_prediction',
    'venuezip_prediction',
    'fee_prediction',
    'miles_prediction',
]].copy()

In [98]:
# Test target: Participated for the rows from position 118491 onward.
y_gb_test = gb_data.iloc[118491:]['Participated'].copy()

In [99]:
# Features and target must have the same number of rows.
len(X_gb_test), len(y_gb_test)

(29622, 29622)

In [100]:
# Persist the prepared ensemble input. Binary mode ('wb') matches the 'rb' mode
# the next section uses to read this file back.
with open('25OCT17_repeatablefeaturesrun/gb_data.pkl', 'wb') as f:
    pickle.dump(gb_data, f)

## Train Gradient Boosted Regressor out of the box using prepared data

In [3]:
# Reload the prepared ensemble input saved above (the execution counts restart here,
# so this section was run in a fresh kernel).
# NOTE(review): pickle.load can execute arbitrary code; only load pickles produced by this project.
with open('25OCT17_repeatablefeaturesrun/gb_data.pkl', 'rb') as gb_file:
    gb_data = pickle.load(gb_file)

In [4]:
# Rebuild the training features from the reloaded gb_data:
# the five ALS prediction columns for the first 118491 rows.
X_gb_train = gb_data.iloc[:118491][[
    'series_prediction',
    'eventtype_prediction',
    'venuezip_prediction',
    'fee_prediction',
    'miles_prediction',
]].copy()

In [5]:
# Rebuild the training target: Participated for the first 118491 rows.
y_gb_train = gb_data.iloc[:118491]['Participated'].copy()

In [6]:
# Features and target must have the same number of rows.
len(X_gb_train), len(y_gb_train)

(118491, 118491)

In [7]:
# Rebuild the test features: the five ALS prediction columns for rows from position 118491 onward.
X_gb_test = gb_data.iloc[118491:][[
    'series_prediction',
    'eventtype_prediction',
    'venuezip_prediction',
    'fee_prediction',
    'miles_prediction',
]].copy()

In [8]:
# Rebuild the test target: Participated for rows from position 118491 onward.
y_gb_test = gb_data.iloc[118491:]['Participated'].copy()

In [9]:
# Features and target must have the same number of rows.
len(X_gb_test), len(y_gb_test)

(29622, 29622)

In [10]:
from sklearn.ensemble import GradientBoostingRegressor

In [11]:
# Out-of-the-box gradient boosted regressor (all defaults; see the repr printed after fit).
# NOTE(review): random_state is left at None — consider fixing a seed so reruns are reproducible.
gb_model = GradientBoostingRegressor()

In [12]:
# Fit on the five ALS prediction columns, with the 0/1 Participated column as the regression target.
gb_model.fit(X_gb_train, y_gb_train)

GradientBoostingRegressor(alpha=0.9, criterion='friedman_mse', init=None,
             learning_rate=0.1, loss='ls', max_depth=3, max_features=None,
             max_leaf_nodes=None, min_impurity_decrease=0.0,
             min_impurity_split=None, min_samples_leaf=1,
             min_samples_split=2, min_weight_fraction_leaf=0.0,
             n_estimators=100, presort='auto', random_state=None,
             subsample=1.0, verbose=0, warm_start=False)

In [13]:
# score() uses R^2 here, which is not a helpful metric for this problem;
# the rank metric from rank_eval is computed below instead.
gb_model.score(X_gb_test, y_gb_test)

0.028644274749018983

In [14]:
import rank_eval

`rank_eval.RankEval` needs a DataFrame that includes columns for user, item, actual outcome, and predicted outcome.
The `user`, `item`, `actual`, and `prediction` arguments are strings giving the name of each of those columns
in the predictions DataFrame.

In [15]:
# Predicted participation scores for the held-out rows, in the same row order as X_gb_test.
gb_predictions = gb_model.predict(X_gb_test)

In [16]:
# Compare the last rows of gb_data with the last rows of X_gb_test (next cell)
# to confirm the test slice lines up with the source frame.
gb_data.tail()

Unnamed: 0,PersonID,EventID,Participated,Event_Date,series_prediction,eventtype_prediction,venuezip_prediction,fee_prediction,miles_prediction
148108,3740,67,0,1486252800000000000,0.963359,0.994047,0.147309,0.639008,0.740542
148109,7650,67,0,1486252800000000000,0.0,0.0,0.0,0.0,0.0
148110,458,67,0,1486252800000000000,0.228323,0.994047,0.147309,0.14781,0.740542
148111,7643,67,0,1486252800000000000,0.0,0.0,0.0,0.0,0.0
148112,4695,67,0,1486252800000000000,0.215541,0.017411,0.0,0.336714,0.227744


In [17]:
# The index labels and values here should match the tail of gb_data shown above.
X_gb_test.tail()

Unnamed: 0,series_prediction,eventtype_prediction,venuezip_prediction,fee_prediction,miles_prediction
148108,0.963359,0.994047,0.147309,0.639008,0.740542
148109,0.0,0.0,0.0,0.0,0.0
148110,0.228323,0.994047,0.147309,0.14781,0.740542
148111,0.0,0.0,0.0,0.0,0.0
148112,0.215541,0.017411,0.0,0.336714,0.227744


In [19]:
import pandas as pd

# Assemble the frame RankEval consumes: user and item ids for the test rows (position 118491 onward),
# the actual outcome, and the gradient boosted prediction.
rankeval_testpreds_df = pd.DataFrame({
    'PersonID': gb_data.iloc[118491:]['PersonID'],
    'EventID': gb_data.iloc[118491:]['EventID'],
    'Participated': y_gb_test.values,
    'prediction': gb_predictions,
})

In [20]:
# Preview the rank-evaluation input; the index labels start at 118491 because they come from gb_data.
rankeval_testpreds_df.head()

Unnamed: 0,EventID,Participated,PersonID,prediction
118491,64,0,3501,0.004708
118492,64,0,3499,0.004708
118493,64,0,3498,0.004708
118494,64,0,3497,0.004708
118495,64,0,3514,0.001085


In [21]:
# Build the rank evaluator, naming the user, item, actual and prediction columns of the frame.
rank_processing = rank_eval.RankEval(rankeval_testpreds_df, 'PersonID', 'EventID', 'Participated', 'prediction')

In [22]:
# Rank metric of the gradient boosted ensemble on the held-out rows.
rank_processing.calc_test_rank()

6.7685320254790451

In [23]:
# Rank metric of the popularity baseline on the same rows, for comparison with the model rank above.
rank_processing.calc_popular_rank()

30.009633911368006

In [24]:
# Record the hyperparameters of the fitted model (all defaults in this run).
gb_model.get_params()

{'alpha': 0.9,
 'criterion': 'friedman_mse',
 'init': None,
 'learning_rate': 0.1,
 'loss': 'ls',
 'max_depth': 3,
 'max_features': None,
 'max_leaf_nodes': None,
 'min_impurity_decrease': 0.0,
 'min_impurity_split': None,
 'min_samples_leaf': 1,
 'min_samples_split': 2,
 'min_weight_fraction_leaf': 0.0,
 'n_estimators': 100,
 'presort': 'auto',
 'random_state': None,
 'subsample': 1.0,
 'verbose': 0,
 'warm_start': False}

In [25]:
# Pair each feature importance with its column name. list(...) makes the pairs display
# on Python 3 as well, where a bare zip is a lazy iterator; on Python 2 the output is unchanged.
list(zip(gb_model.feature_importances_, X_gb_train.columns.values))

[(0.26978860047907643, 'series_prediction'),
 (0.23687755921485665, 'eventtype_prediction'),
 (0.21206301581585485, 'venuezip_prediction'),
 (0.12285071491471172, 'fee_prediction'),
 (0.15842010957550032, 'miles_prediction')]

## Next Steps:
- Alter Code to Support and Rerun Each ALS with Train + Validate Data for Final Fitted Model & Get Predictions on larger dataset; run another test on test data held out previously
- Rerun gradient boosted model using train+val larger dataset to train and test set to test
- Write code to look at events recommended most often in top 10 recommendations DONE
- Write code to look at the users most likely to be interested in a given event (get list of users for which event is listed in their top 10 recommendations) DONE
- Make graphs for presentation showing how I fit my ALS models and the value of the rank eval metric

In [48]:
from collections import defaultdict

# D_topeventcounts: event id -> number of users who have that event in their top 10.
# D_likelyusers:    event id -> list of users who have that event in their top 10.
D_topeventcounts = defaultdict(int)
D_likelyusers = defaultdict(list)

# One groupby pass replaces a full boolean-mask scan of the frame per person.
# sort=False keeps persons in first-appearance order (the same order as .unique()), and
# groupby preserves row order within each group, so the per-person sort sees the same input.
for person, person_preds in rankeval_testpreds_df.groupby('PersonID', sort=False):
    top_events = person_preds.sort_values('prediction', ascending=False)['EventID'].values[:10]

    for event in top_events:
        D_topeventcounts[event] += 1
        D_likelyusers[event].append(person)

In [187]:
# Scratch check: the earliest 80% of cleaned_data rows when ordered by Event_Date.
a = cleaned_data.sort_values(by='Event_Date', ascending=True).iloc[:int(round(len(cleaned_data)*(1-.2)))]

In [189]:
# Scratch check: the remaining (latest 20%) cleaned_data rows when ordered by Event_Date.
b = cleaned_data.sort_values(by='Event_Date', ascending=True).iloc[int(round(len(cleaned_data)*(1-.2))):]

In [190]:
# Sizes of the two chronological slices.
len(a), len(b)

(13582, 3396)

In [191]:
# The two slices together should account for every row of cleaned_data.
len(a) + len(b)

16978

In [None]:
# TODO: unfinished scratch cell (never executed); a bare attribute access has no effect.
cleaned_data.join