In [371]:
import pandas as pd
pd.set_option('display.max_columns', 500)
pd.set_option("display.max_rows", 999)
import numpy as np
from IPython.display import display

def read(fp):
    """Load a CSV file into a pandas DataFrame.

    Parameters
    ----------
    fp : str or file-like
        Path (or open handle) handed straight to ``pd.read_csv``.

    Returns
    -------
    pandas.DataFrame
        The parsed file, using ``read_csv`` defaults.
    """
    return pd.read_csv(fp)

# Load the three raw CSV files (people, training activities, test activities)
# in one pass; the unpacking order matches the order of the paths.
people, activities_train, activities_test = map(
    read,
    ("data/people.csv", "data/act_train.csv", "data/act_test.csv"),
)
print("Successfully loaded datasets!")

Successfully loaded datasets!


### Explore People

The first nine characteristics of each person (`char_1`–`char_9`), together with `group_1`, are nominal features that need to be encoded into numerical format for further analysis. The remaining characteristics `char_10`–`char_37` are booleans, which can be represented as either 0 or 1. Note that **`char_38`** is the only integer-valued column; it needs to be normalized so that it does not carry more weight than the other columns.  

In [2]:
# Shape of the people table
print("People df has {} rows and {} columns".format(people.shape[0], people.shape[1]))

# Show the first five rows of the people table.
# display() renders the frame itself and returns None, so it must not be
# wrapped in print() (that only adds a stray "None" to the output).
display(people.head())

People df has 189118 rows and 41 columns


Unnamed: 0,people_id,char_1,group_1,char_2,date,char_3,char_4,char_5,char_6,char_7,char_8,char_9,char_10,char_11,char_12,char_13,char_14,char_15,char_16,char_17,char_18,char_19,char_20,char_21,char_22,char_23,char_24,char_25,char_26,char_27,char_28,char_29,char_30,char_31,char_32,char_33,char_34,char_35,char_36,char_37,char_38
0,ppl_100,type 2,group 17304,type 2,2021-06-29,type 5,type 5,type 5,type 3,type 11,type 2,type 2,True,False,False,True,True,False,True,False,False,False,False,True,False,False,False,False,False,True,True,False,True,True,False,False,True,True,True,False,36
1,ppl_100002,type 2,group 8688,type 3,2021-01-06,type 28,type 9,type 5,type 3,type 11,type 2,type 4,False,False,True,True,False,False,False,True,False,False,False,False,False,True,False,True,True,True,False,False,True,True,True,True,True,True,True,False,76
2,ppl_100003,type 2,group 33592,type 3,2022-06-10,type 4,type 8,type 5,type 2,type 5,type 2,type 2,True,True,True,True,True,True,False,True,False,True,False,True,True,True,True,True,True,True,True,False,False,True,True,True,True,False,True,True,99
3,ppl_100004,type 2,group 22593,type 3,2022-07-20,type 40,type 25,type 9,type 4,type 16,type 2,type 2,True,True,True,True,True,False,True,True,True,True,True,True,True,True,False,True,True,True,True,True,True,True,True,True,True,True,True,True,76
4,ppl_100006,type 2,group 6534,type 3,2022-07-27,type 40,type 25,type 9,type 3,type 8,type 2,type 2,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,True,False,False,False,True,True,False,84


None


In [396]:
# Inspect the dtype of every people column to see which ones are
# object (string), bool or integer before encoding them.
people.dtypes

people_id    object
char_1       object
group_1      object
char_2       object
date         object
char_3       object
char_4       object
char_5       object
char_6       object
char_7       object
char_8       object
char_9       object
char_10        bool
char_11        bool
char_12        bool
char_13        bool
char_14        bool
char_15        bool
char_16        bool
char_17        bool
char_18        bool
char_19        bool
char_20        bool
char_21        bool
char_22        bool
char_23        bool
char_24        bool
char_25        bool
char_26        bool
char_27        bool
char_28        bool
char_29        bool
char_30        bool
char_31        bool
char_32        bool
char_33        bool
char_34        bool
char_35        bool
char_36        bool
char_37        bool
char_38       int64
dtype: object

#### Groups

There is a large number of people (77,314) in group 17304.

In [4]:
# Ten largest groups by head count. Written with print(...) and str.format so
# the cell runs under both Python 2 and Python 3, like the other cells.
print("# of people in each group:\n{}".format(people["group_1"].value_counts().head(10)))

# of people in each group:
group 17304    77314
group 667       1538
group 8386      1046
group 9280       666
group 450        659
group 1482       484
group 15723      461
group 3229       423
group 17899      414
group 3598       365
Name: group_1, dtype: int64


### Explore Activities

There are nine characteristics associated with activities that are type 1.  Other activity categories do not contain these characteristics, but will contain char_10 information; type 1 does not contain any char_10 info.

In [496]:
print("Activities train df has {} rows and {} columns".format(activities_train.shape[0], activities_train.shape[1]))

# A look into type 1 activities.
# Keep the sample frame in the variable and display it separately:
# display() returns None, so assigning its result would discard the data.
type_one_activity = activities_train[activities_train["activity_category"] == "type 1"].head(5)
display(type_one_activity)
print("Activities test df has {} rows and {} columns".format(activities_test.shape[0], activities_test.shape[1]))


Activities train df has 2197291 rows and 15 columns


Unnamed: 0,people_id,activity_id,date,activity_category,char_1,char_2,char_3,char_4,char_5,char_6,char_7,char_8,char_9,char_10,outcome
52,ppl_100025,act1_9923,2022-11-25,type 1,type 3,type 5,type 1,type 1,type 6,type 3,type 3,type 6,type 8,,0
105,ppl_100033,act1_198174,2022-07-26,type 1,type 36,type 11,type 5,type 1,type 6,type 1,type 1,type 4,type 1,,0
106,ppl_100033,act1_214090,2023-06-15,type 1,type 24,type 6,type 6,type 3,type 1,type 3,type 4,type 5,type 1,,0
107,ppl_100033,act1_230588,2023-02-28,type 1,type 2,type 2,type 3,type 3,type 5,type 2,type 2,type 4,type 2,,0
108,ppl_100033,act1_271874,2022-07-26,type 1,type 2,type 5,type 3,type 2,type 6,type 1,type 1,type 6,type 8,,0


Activities test df has 498687 rows and 14 columns


### Process dates

In [448]:
def col_to_datetime(df, column):
    """Return a copy of ``df`` with ``column`` parsed as datetimes.

    The column keeps its name and position; only its values are replaced
    by the result of ``pd.to_datetime``. The input frame is left untouched.

    Parameters
    ----------
    df : pandas.DataFrame
    column : label
        Name of the column to convert.

    Returns
    -------
    pandas.DataFrame
    """
    parsed = pd.to_datetime(df[column])
    converted = df.copy()
    converted[column] = parsed
    return converted

In [449]:
def process_date(df):
    """Split the ``date`` column into ``year``, ``month`` and ``day``.

    Expects ``df["date"]`` to already be a datetime column (it is accessed
    through ``.dt``). Returns a new frame with the three integer columns
    added and the original ``date`` column removed.
    """
    parts = df["date"].dt
    expanded = df.assign(year=parts.year, month=parts.month, day=parts.day)
    return expanded.drop('date', axis=1)

### Normalize columns

In [450]:
def normalize(df, column):
    """Min-max scale ``column`` to the range [0, 1] on a copy of ``df``.

    Parameters
    ----------
    df : pandas.DataFrame
        Input frame; it is not modified.
    column : label
        Name of the numeric column to rescale.

    Returns
    -------
    pandas.DataFrame
        Copy of ``df`` whose ``column`` holds ``(x - min) / (max - min)``.
        If every value in the column is identical the range is zero, and
        the column is set to 0.0 instead of producing NaN from 0 / 0.
    """
    # The copy must be bound to a name; a bare ``df.copy()`` is discarded
    # and the caller's frame would be mutated by the assignment below.
    df = df.copy()
    values = df[column]
    lowest = values.min()
    span = values.max() - lowest
    if span == 0:
        df[column] = 0.0
    else:
        df[column] = (values - lowest) / span
    return df

### Missing values

In [451]:
def drop(df, columns):
    """Return a new frame without the given column(s).

    Parameters
    ----------
    df : pandas.DataFrame
        Input frame; it is not modified.
    columns : label or list of labels
        Column name(s) to remove.
    """
    remaining = df.drop(labels=columns, axis=1)
    return remaining

### Categorical Variables

In [512]:
def process_cat_variables(df, columns, substring, prefix='type '):
    """Convert prefixed categorical strings (e.g. ``"type 5"``) to integers.

    Every object-dtype column whose name contains ``substring`` is
    processed: missing values are replaced by the sentinel ``-999``, the
    leading ``prefix`` is removed and the remainder is cast to ``int``.

    Parameters
    ----------
    df : pandas.DataFrame
        Input frame; it is not modified.
    columns : any
        Unused. Kept so existing ``.pipe(process_cat_variables, df.columns, ...)``
        call sites keep working.
    substring : str
        Only columns whose name contains this text are converted.
    prefix : str, default ``'type '``
        Literal text that precedes the number in each value.

    Returns
    -------
    pandas.DataFrame
        Copy of ``df`` with the selected columns converted to integers.

    Raises
    ------
    ValueError
        If a value is not an integer once the prefix has been removed.
    """
    df = df.copy()
    # ``items`` replaces ``iteritems``, which was removed in pandas 2.0.
    categorical_columns = [col for col, col_data in df.items()
                           if col_data.dtype == object and substring in col]
    if not categorical_columns:
        return df

    def strip_prefix(value):
        # Remove the literal prefix only. ``str.lstrip(prefix)`` would treat
        # ``prefix`` as a *set of characters* to strip, not as a prefix.
        return value[len(prefix):] if value.startswith(prefix) else value

    # Build the sentinel from the active prefix so it also parses when the
    # prefix is not 'type ' (e.g. 'group ').
    missing_label = prefix + '-999'
    df[categorical_columns] = (df[categorical_columns]
                               .fillna(missing_label)
                               .apply(lambda column: column.map(strip_prefix).astype(int)))
    return df

### Booleans

In [487]:
def process_bool_columns(df, columns, substring):
    """Convert boolean columns to 0/1 integers.

    Parameters
    ----------
    df : pandas.DataFrame
        Input frame; it is not modified.
    columns : any
        Unused. Kept so existing ``.pipe(process_bool_columns, df.columns, ...)``
        call sites keep working.
    substring : str
        Only bool-dtype columns whose name contains this text are converted.

    Returns
    -------
    pandas.DataFrame
        Copy of ``df`` with the selected columns cast to ``int``.
    """
    df = df.copy()
    # ``items`` replaces ``iteritems``, which was removed in pandas 2.0.
    bool_columns = [col for col, col_data in df.items()
                    if col_data.dtype == bool and substring in col]
    if bool_columns:
        df[bool_columns] = df[bool_columns].astype(int)
    return df

### Method Chaining to clean people and activities data

In [513]:
def process_people(df):
    """Run the full cleaning pipeline on the people table.

    Steps, in order: parse ``date`` as datetime, expand it into
    year/month/day, min-max normalize ``char_38``, encode the ``char``
    categorical columns (``'type '`` prefix), encode the ``group`` column
    (``'group '`` prefix) and cast the boolean ``char`` columns to integers.
    Returns the processed frame.
    """
    source_columns = df.columns
    steps = (
        (col_to_datetime, ("date",)),
        (process_date, ()),
        (normalize, ("char_38",)),
        (process_cat_variables, (source_columns, "char", 'type ')),
        (process_cat_variables, (source_columns, "group", "group ")),
        (process_bool_columns, (source_columns, "char")),
    )
    # Start from a copy and feed each step's output into the next one.
    result = df.copy()
    for step, extra_args in steps:
        result = step(result, *extra_args)
    return result

# Clean the people table and preview the first two processed rows.
processed_people = process_people(people)
processed_people.head(2)

Unnamed: 0,people_id,char_1,group_1,char_2,char_3,char_4,char_5,char_6,char_7,char_8,char_9,char_10,char_11,char_12,char_13,char_14,char_15,char_16,char_17,char_18,char_19,char_20,char_21,char_22,char_23,char_24,char_25,char_26,char_27,char_28,char_29,char_30,char_31,char_32,char_33,char_34,char_35,char_36,char_37,char_38,day,month,year
0,ppl_100,2,17304,2,5,5,5,3,11,2,2,1,0,0,1,1,0,1,0,0,0,0,1,0,0,0,0,0,1,1,0,1,1,0,0,1,1,1,0,0.36,29,6,2021
1,ppl_100002,2,8688,3,28,9,5,3,11,2,4,0,0,1,1,0,0,0,1,0,0,0,0,0,1,0,1,1,1,0,0,1,1,1,1,1,1,1,0,0.76,6,1,2021


In [505]:
def process_activities(df):
    """Run the full cleaning pipeline on an activities table.

    Steps, in order: parse ``date`` as datetime, expand it into
    year/month/day, encode the ``char`` categorical columns and the
    activity category column as integers (missing values are replaced by
    the sentinel used in ``process_cat_variables``), and cast any boolean
    ``char`` columns to integers.

    Parameters
    ----------
    df : pandas.DataFrame
        Raw activities frame (train or test); it is not modified.

    Returns
    -------
    pandas.DataFrame
        The processed frame.
    """
    df = df.copy()
    # The 'type ' prefix is passed explicitly: process_cat_variables takes
    # the prefix as its fourth argument, and omitting it raised a TypeError.
    return (df.pipe(col_to_datetime, "date")
              .pipe(process_date)
              .pipe(process_cat_variables, df.columns, "char", 'type ')
              .pipe(process_cat_variables, df.columns, "activity_cat", 'type ')
              .pipe(process_bool_columns, df.columns, "char"))

# Preprocess the train and test activities with the same pipeline,
# then preview the processed training rows.
processed_activities_train = process_activities(activities_train)
processed_activities_test = process_activities(activities_test)
processed_activities_train.head()

### Join dataframes

Add data from the people dataframe to activity dataframes 

In [514]:
# Attach each person's attributes to their activities. A left join keeps
# every activity row; clashing column names get an explicit suffix.
training_data = processed_activities_train.merge(
    processed_people, how='left', on='people_id',
    suffixes=('_activities', '_people'))
test_data = processed_activities_test.merge(
    processed_people, how='left', on='people_id',
    suffixes=('_activities', '_people'))

### Extract class label

In [None]:
# Target vector: the outcome column of the merged training frame
y_train = training_data["outcome"]

# Remove the target and the identifier columns from the feature frames
cols = ["people_id", "activity_id"]
training_data = drop(drop(training_data, ["outcome"]), cols)
test_data = drop(test_data, cols)
training_data.head()

In [519]:
# Preview the merged training features after the label/id columns were removed.
training_data.head()

Unnamed: 0,activity_category,char_1_activities,char_2_activities,char_3_activities,char_4_activities,char_5_activities,char_6_activities,char_7_activities,char_8_activities,char_9_activities,char_10_activities,day_activities,month_activities,year_activities,char_1_people,group_1,char_2_people,char_3_people,char_4_people,char_5_people,char_6_people,char_7_people,char_8_people,char_9_people,char_10_people,char_11,char_12,char_13,char_14,char_15,char_16,char_17,char_18,char_19,char_20,char_21,char_22,char_23,char_24,char_25,char_26,char_27,char_28,char_29,char_30,char_31,char_32,char_33,char_34,char_35,char_36,char_37,char_38,day_people,month_people,year_people
0,4,-999,-999,-999,-999,-999,-999,-999,-999,-999,76,26,8,2023,2,17304,2,5,5,5,3,11,2,2,1,0,0,1,1,0,1,0,0,0,0,1,0,0,0,0,0,1,1,0,1,1,0,0,1,1,1,0,0.36,29,6,2021
1,2,-999,-999,-999,-999,-999,-999,-999,-999,-999,1,27,9,2022,2,17304,2,5,5,5,3,11,2,2,1,0,0,1,1,0,1,0,0,0,0,1,0,0,0,0,0,1,1,0,1,1,0,0,1,1,1,0,0.36,29,6,2021
2,2,-999,-999,-999,-999,-999,-999,-999,-999,-999,1,27,9,2022,2,17304,2,5,5,5,3,11,2,2,1,0,0,1,1,0,1,0,0,0,0,1,0,0,0,0,0,1,1,0,1,1,0,0,1,1,1,0,0.36,29,6,2021
3,2,-999,-999,-999,-999,-999,-999,-999,-999,-999,1,4,8,2023,2,17304,2,5,5,5,3,11,2,2,1,0,0,1,1,0,1,0,0,0,0,1,0,0,0,0,0,1,1,0,1,1,0,0,1,1,1,0,0.36,29,6,2021
4,2,-999,-999,-999,-999,-999,-999,-999,-999,-999,1,26,8,2023,2,17304,2,5,5,5,3,11,2,2,1,0,0,1,1,0,1,0,0,0,0,1,0,0,0,0,0,1,1,0,1,1,0,0,1,1,1,0,0.36,29,6,2021


#### Combine Levels

Group several levels based on their frequency distribution.  

In [361]:
def combine_levels(df):
    """Placeholder for frequency-based level grouping; currently a no-op.

    The loop body below consists only of a string literal that holds the
    disabled implementation, so nothing is computed and ``df`` is returned
    unchanged (the same object, not a copy).

    NOTE(review): the listed column names use ``_x``/``_y`` suffixes, while
    the merge cell in this notebook uses ``_activities``/``_people`` --
    confirm the names before re-enabling the disabled code.
    """
    columns = ["activity_category", "char_10_x",
                     "char_1_y", "char_2_y", "char_3_y", "char_4_y",
                     "char_5_y", "char_6_y", "char_7_y", "char_8_y",
                     "char_9_y"]
    
    for col in columns:
        # Disabled implementation kept as a string literal: it would bin each
        # level by the percentile of its frequency count. As written, the
        # loop evaluates the string and does nothing else.
        """
        counts = df[col].value_counts()
        max_val = max(counts)
        min_val = min(counts)
        q1 = np.percentile(counts, 25)
        q2 = np.percentile(counts, 50)
        q3 = np.percentile(counts, 75)
        q4 = np.percentile(counts, 90)
        bins = [0, min_val, q1, q2, q3, q4, max_val]
        groups = [1, 2, 3, 4, 5, 6]
        categories = pd.cut(counts, bins, labels=groups)

        counts_dic = dict(counts)
        df[col] = df[col].map(lambda x: (pd.cut([counts_dic[x]], bins, labels = groups)).astype("float"))
        #df['test'] = pd.cut(df[col], bins, labels=groups)"""
    return df

# combine_levels currently returns its argument unchanged (its body is
# disabled), so these two calls leave both frames as they were.
test_data = combine_levels(test_data)
training_data = combine_levels(training_data)

In [365]:
def dummy_features(df, columns=None):
    """One-hot encode categorical columns.

    Each listed column is replaced by indicator columns named
    ``<column>_<level>``; all other columns are kept as they are and the
    indicator columns are appended after them. The input frame is not
    modified.

    Parameters
    ----------
    df : pandas.DataFrame
    columns : list of labels, optional
        Columns to encode. Defaults to the original hard-coded list, whose
        names carry ``_x``/``_y`` suffixes. NOTE(review): the merge cell in
        this notebook uses ``_activities``/``_people`` suffixes instead, so
        pass the matching names explicitly when encoding that frame.

    Returns
    -------
    pandas.DataFrame

    Raises
    ------
    KeyError
        If a listed column is not present in ``df``.
    """
    if columns is None:
        columns = ["activity_category", "char_1_x", "char_2_x",
                   "char_3_x", "char_4_x", "char_5_x", "char_6_x",
                   "char_7_x", "char_8_x", "char_9_x", "char_10_x",
                   "char_1_y", "char_2_y", "char_3_y", "char_4_y",
                   "char_5_y", "char_6_y", "char_7_y", "char_8_y",
                   "char_9_y", "group_1"]
    else:
        columns = list(columns)

    # A single get_dummies call encodes every listed column, prefixes the
    # indicators with the column name and drops the originals, replacing the
    # repeated concat-in-a-loop followed by an in-place drop.
    return pd.get_dummies(df, columns=columns)

### Drop columns

In [113]:
# Keep the test activity ids for the submission file. They are read from the
# raw test frame because X_test is only defined in the next cell and
# "activity_id" has already been dropped from test_data by then.
# NOTE(review): this assumes test_data keeps the row order of activities_test
# (left merge on people_id with one people row per id) -- confirm.
activity_ids_test = activities_test["activity_id"]

In [114]:
# Identifier / raw-date columns must not reach the model. Earlier cells may
# already have removed some of them ("date" is dropped by process_date, the
# ids by the label-extraction cell), so only drop the ones still present
# instead of failing with a KeyError.
columns = ["people_id", "activity_id", "date"]
X_train = drop(training_data, [col for col in columns if col in training_data.columns])
X_test = drop(test_data, [col for col in columns if col in test_data.columns])

### Validation Set

In [156]:
# Hold-out validation split
def split(X, y):
    """Split features and labels into a leading 80% and a trailing 20%.

    The split is positional (no shuffling): the first ``int(len(X) * 0.8)``
    rows form the training part and the remaining rows the validation part.

    Returns
    -------
    tuple
        ``(X_train_mini, X_val, y_train_mini, y_val)``
    """
    cutoff = int(X.shape[0] * .8)
    head, tail = slice(None, cutoff), slice(cutoff, None)
    return X.iloc[head], X.iloc[tail], y[head], y[tail]

# First 80% of the rows become the reduced training set, the last 20% the
# validation set (positional split, see split()).
X_train_mini, X_val, y_train_mini, y_val = split(X_train, y_train)

### Pipeline transforms with estimator, then get cross val scores

In [166]:
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.pipeline import Pipeline
from sklearn import tree

# sklearn.lda and sklearn.cross_validation were removed from scikit-learn;
# prefer the current module paths and fall back to the legacy ones so the
# cell still runs on old installations.
try:
    from sklearn.discriminant_analysis import LinearDiscriminantAnalysis as LDA
except ImportError:
    from sklearn.lda import LDA

try:
    from sklearn.model_selection import cross_val_score
except ImportError:
    from sklearn.cross_validation import cross_val_score

def cross_val(X, y, estimator=None):
    """Print 10-fold cross-validation scores for an estimator.

    Parameters
    ----------
    X : array-like
        Feature matrix.
    y : array-like
        Target labels.
    estimator : estimator object, optional
        Model or pipeline to evaluate. When omitted, the notebook-level
        ``pipe_lr`` pipeline is used, which must be defined before the call.

    Returns
    -------
    None
        The per-fold scores and their mean +/- standard deviation are printed.
    """
    if estimator is None:
        # Backward-compatible default: look up the global pipeline at call time.
        estimator = pipe_lr
    scores = cross_val_score(estimator=estimator, X=X, y=y, cv=10, n_jobs=1)
    print('CV accuracy scores: %s' % scores)
    print('CV accuracy: %.3f +/- %.3f' % (np.mean(scores), np.std(scores)))

In [167]:
# Scale -> LDA projection -> logistic regression.
# LDA can produce at most (n_classes - 1) components, and current
# scikit-learn raises an error when n_components exceeds that limit.
# NOTE(review): this assumes `outcome` is binary, which caps LDA at one
# component -- confirm against the label column.
pipe_lr = Pipeline([('scl', StandardScaler()),
                    ('lda', LDA(n_components=1)),
                    ('clf', LogisticRegression(random_state=1))])

cross_val(X_train_mini, y_train_mini)

CV accuracy scores: [ 0.72069825  0.98753117  0.89775561  0.57605985  0.715       0.865
  0.83709273  0.76190476  0.77443609  0.64160401]
CV accuracy: 0.778 +/- 0.117


### Metrics

In [175]:
from sklearn.metrics import f1_score
from sklearn.metrics import roc_auc_score

def predict(clf, X_train, y_train, X_test):
    """Fit ``clf`` and predict on both the training and the test features.

    Parameters
    ----------
    clf : estimator
        Object exposing ``fit(X, y)`` and ``predict(X)``.
    X_train, y_train :
        Data the estimator is fitted on.
    X_test :
        Features to predict after fitting.

    Returns
    -------
    tuple
        ``(y_pred_train, y_pred_test)``
    """
    clf.fit(X_train, y_train)
    # Predict with the estimator that was just fitted. The previous version
    # used the global ``pipe_lr`` here, which ignored ``clf`` whenever a
    # different estimator was passed in.
    y_pred_train = clf.predict(X_train)
    y_pred_test = clf.predict(X_test)
    return y_pred_train, y_pred_test

def calc_metrics(clf, y_pred_train, y_pred_test, y_test, y_train=None):
    """Print F1 scores for the train/test predictions and the test ROC AUC.

    Parameters
    ----------
    clf : estimator
        Unused; kept for backward compatibility with existing calls.
    y_pred_train : array-like
        Predictions for the training rows.
    y_pred_test : array-like
        Predictions for the evaluation rows.
    y_test : array-like
        True labels matching ``y_pred_test``.
    y_train : array-like, optional
        True labels matching ``y_pred_train``. When omitted, the
        notebook-level ``y_train_mini`` is used, as before.

    Returns
    -------
    None
        All metrics are printed.
    """
    if y_train is None:
        # Backward-compatible default: the global reduced training labels.
        y_train = y_train_mini

    # f1 scores -- print(...) with str.format works on Python 2 and 3 alike
    print("f1 score on training set: {}".format(f1_score(y_train, y_pred_train)))
    print("f1 score on testing set: {}".format(f1_score(y_test, y_pred_test)))

    # area under curve
    # NOTE(review): the AUC is computed from whatever y_pred_test holds; if
    # these are hard class labels rather than scores, the value is coarser
    # than a probability-based AUC -- confirm this is intended.
    auc_score = roc_auc_score(y_test, y_pred_test)
    print("Area under the curve: {}".format(auc_score))

# validation predictions & metrics
# Fit on the reduced training set and predict both it and the held-out
# validation rows; y_pred_test therefore holds the validation predictions.
y_pred_train, y_pred_test = predict(pipe_lr, X_train_mini, y_train_mini, X_val)
calc_metrics(pipe_lr, y_pred_train, y_pred_test, y_val)

 f1 score on training set: 0.977975528365
f1 score on testing set: 0.711351351351
('Area under the curve:', 0.74699662223997065)


Unnamed: 0,ppl_char_10,ppl_char_11,ppl_char_12,ppl_char_13,ppl_char_14,ppl_char_15,ppl_char_16,ppl_char_17,ppl_char_18,ppl_char_19,ppl_char_20,ppl_char_21,ppl_char_22,ppl_char_23,ppl_char_24,ppl_char_25,ppl_char_26,ppl_char_27,ppl_char_28,ppl_char_29,ppl_char_30,ppl_char_31,ppl_char_32,ppl_char_33,ppl_char_34,ppl_char_35,ppl_char_36,ppl_char_37,ppl_char_38,activity_category_type 1,activity_category_type 2,activity_category_type 3,activity_category_type 4,activity_category_type 5,activity_category_type 6,activity_category_type 7,char_1_type 1,char_1_type 10,char_1_type 11,char_1_type 12,char_1_type 13,char_1_type 15,char_1_type 16,char_1_type 17,char_1_type 19,char_1_type 2,char_1_type 20,char_1_type 23,char_1_type 24,char_1_type 25,char_1_type 26,char_1_type 29,char_1_type 3,char_1_type 30,char_1_type 36,char_1_type 4,char_1_type 41,char_1_type 5,char_1_type 6,char_1_type 7,char_1_type 8,char_1_type 9,char_2_type 1,char_2_type 10,char_2_type 11,char_2_type 12,char_2_type 13,char_2_type 14,char_2_type 16,char_2_type 17,char_2_type 19,char_2_type 2,char_2_type 25,char_2_type 26,char_2_type 29,char_2_type 3,char_2_type 4,char_2_type 5,char_2_type 6,char_2_type 7,char_2_type 8,char_2_type 9,char_3_type 1,char_3_type 2,char_3_type 3,char_3_type 4,char_3_type 5,char_3_type 6,char_3_type 7,char_3_type 8,char_3_type 9,char_4_type 1,char_4_type 2,char_4_type 3,char_4_type 4,char_4_type 5,char_4_type 6,char_5_type 1,char_5_type 2,char_5_type 3,char_5_type 4,char_5_type 5,char_5_type 6,char_6_type 1,char_6_type 2,char_6_type 3,char_6_type 4,char_6_type 5,char_7_type 1,char_7_type 2,char_7_type 3,char_7_type 4,char_7_type 5,char_7_type 6,char_7_type 7,char_7_type 8,char_8_type 1,char_8_type 10,char_8_type 11,char_8_type 12,char_8_type 13,char_8_type 14,char_8_type 15,char_8_type 16,char_8_type 18,char_8_type 2,char_8_type 3,char_8_type 4,char_8_type 5,char_8_type 6,char_8_type 7,char_8_type 8,char_8_type 9,char_9_type 1,char_9_type 10,char_9_type 12,char_9_type 13,char_9_type 
14,char_9_type 15,char_9_type 16,char_9_type 17,char_9_type 18,char_9_type 2,char_9_type 3,char_9_type 4,char_9_type 5,char_9_type 6,char_9_type 7,char_9_type 8,char_9_type 9,char_10_type 1,char_10_type 103,char_10_type 1033,char_10_type 1038,char_10_type 1058,char_10_type 1065,char_10_type 1069,char_10_type 1070,char_10_type 1073,char_10_type 1089,char_10_type 1097,char_10_type 110,char_10_type 1109,char_10_type 111,char_10_type 1111,char_10_type 1116,char_10_type 114,char_10_type 1146,char_10_type 1154,char_10_type 117,char_10_type 1188,char_10_type 1205,char_10_type 1217,char_10_type 123,char_10_type 1231,char_10_type 1251,char_10_type 1259,char_10_type 1261,char_10_type 1264,char_10_type 1271,char_10_type 1302,char_10_type 1305,char_10_type 131,char_10_type 1313,char_10_type 132,char_10_type 133,char_10_type 1341,char_10_type 1352,char_10_type 137,char_10_type 1370,char_10_type 1372,char_10_type 138,char_10_type 1382,char_10_type 1384,char_10_type 1398,char_10_type 141,char_10_type 143,char_10_type 1441,char_10_type 1451,char_10_type 1485,char_10_type 1496,char_10_type 1501,char_10_type 1502,char_10_type 151,char_10_type 1527,char_10_type 155,char_10_type 156,char_10_type 157,char_10_type 1619,char_10_type 166,char_10_type 167,char_10_type 1674,char_10_type 1675,char_10_type 1676,char_10_type 1716,char_10_type 1727,char_10_type 1771,char_10_type 1772,char_10_type 183,char_10_type 184,char_10_type 1876,char_10_type 19,char_10_type 190,char_10_type 1906,char_10_type 194,char_10_type 1953,char_10_type 1959,char_10_type 197,char_10_type 198,char_10_type 199,char_10_type 2,char_10_type 20,char_10_type 201,char_10_type 2020,char_10_type 2023,char_10_type 203,char_10_type 2032,char_10_type 2037,char_10_type 207,char_10_type 2108,char_10_type 214,char_10_type 2225,char_10_type 2260,char_10_type 227,char_10_type 23,char_10_type 230,char_10_type 2316,char_10_type 2333,char_10_type 237,char_10_type 24,...,ppl_char_7_type 23,ppl_char_7_type 24,ppl_char_7_type 
25,ppl_char_7_type 3,ppl_char_7_type 4,ppl_char_7_type 5,ppl_char_7_type 6,ppl_char_7_type 7,ppl_char_7_type 8,ppl_char_7_type 9,ppl_char_8_type 1,ppl_char_8_type 2,ppl_char_8_type 3,ppl_char_8_type 4,ppl_char_8_type 5,ppl_char_8_type 6,ppl_char_8_type 7,ppl_char_8_type 8,ppl_char_9_type 1,ppl_char_9_type 2,ppl_char_9_type 3,ppl_char_9_type 4,ppl_char_9_type 5,ppl_char_9_type 6,ppl_char_9_type 7,ppl_char_9_type 8,ppl_char_9_type 9,group_1_group 10089,group_1_group 10233,group_1_group 1024,group_1_group 10409,group_1_group 10603,group_1_group 10605,group_1_group 10664,group_1_group 1102,group_1_group 11040,group_1_group 11102,group_1_group 11143,group_1_group 11516,group_1_group 1179,group_1_group 11807,group_1_group 11871,group_1_group 1233,group_1_group 12451,group_1_group 12624,group_1_group 1268,group_1_group 12731,group_1_group 12784,group_1_group 1303,group_1_group 13191,group_1_group 1323,group_1_group 13545,group_1_group 13865,group_1_group 1387,group_1_group 1401,group_1_group 14038,group_1_group 14102,group_1_group 14277,group_1_group 14626,group_1_group 14879,group_1_group 1490,group_1_group 15313,group_1_group 15470,group_1_group 15723,group_1_group 15916,group_1_group 16348,group_1_group 1654,group_1_group 16579,group_1_group 16762,group_1_group 16885,group_1_group 16905,group_1_group 17304,group_1_group 17386,group_1_group 17528,group_1_group 17557,group_1_group 17602,group_1_group 17627,group_1_group 17649,group_1_group 17672,group_1_group 17909,group_1_group 18035,group_1_group 18063,group_1_group 18137,group_1_group 18242,group_1_group 18272,group_1_group 18279,group_1_group 18415,group_1_group 18441,group_1_group 18442,group_1_group 18594,group_1_group 18630,group_1_group 18906,group_1_group 18966,group_1_group 1913,group_1_group 19133,group_1_group 19236,group_1_group 19480,group_1_group 19527,group_1_group 19662,group_1_group 1971,group_1_group 19768,group_1_group 19853,group_1_group 20002,group_1_group 20266,group_1_group 20470,group_1_group 
21014,group_1_group 21079,group_1_group 21314,group_1_group 2139,group_1_group 21502,group_1_group 21589,group_1_group 21901,group_1_group 21968,group_1_group 21985,group_1_group 22206,group_1_group 22481,group_1_group 22612,group_1_group 22614,group_1_group 22741,group_1_group 22808,group_1_group 2308,group_1_group 23148,group_1_group 2325,group_1_group 2352,group_1_group 2362,group_1_group 23698,group_1_group 23826,group_1_group 2401,group_1_group 24193,group_1_group 24280,group_1_group 2449,group_1_group 249,group_1_group 25169,group_1_group 2538,group_1_group 25842,group_1_group 26217,group_1_group 27538,group_1_group 27862,group_1_group 28414,group_1_group 28984,group_1_group 29362,group_1_group 29437,group_1_group 30182,group_1_group 30844,group_1_group 31187,group_1_group 31542,group_1_group 32343,group_1_group 32519,group_1_group 32870,group_1_group 33262,group_1_group 33592,group_1_group 33595,group_1_group 33699,group_1_group 33913,group_1_group 34775,group_1_group 3493,group_1_group 35053,group_1_group 3537,group_1_group 3538,group_1_group 35931,group_1_group 3598,group_1_group 36096,group_1_group 36905,group_1_group 37134,group_1_group 37488,group_1_group 37633,group_1_group 38420,group_1_group 38523,group_1_group 3873,group_1_group 38745,group_1_group 38766,group_1_group 3886,group_1_group 38924,group_1_group 39163,group_1_group 39166,group_1_group 39309,group_1_group 39664,group_1_group 40246,group_1_group 4146,group_1_group 4152,group_1_group 41627,group_1_group 42007,group_1_group 4204,group_1_group 4289,group_1_group 432,group_1_group 44217,group_1_group 450,group_1_group 4556,group_1_group 45749,group_1_group 46913,group_1_group 46957,group_1_group 4699,group_1_group 4724,group_1_group 4742,group_1_group 4744,group_1_group 476,group_1_group 48070,group_1_group 48532,group_1_group 48590,group_1_group 486,group_1_group 49,group_1_group 49305,group_1_group 5035,group_1_group 5040,group_1_group 50513,group_1_group 50737,group_1_group 
5149,group_1_group 5399,group_1_group 5641,group_1_group 5693,group_1_group 5826,group_1_group 584,group_1_group 6153,group_1_group 627,group_1_group 637,group_1_group 6423,group_1_group 649,group_1_group 6534,group_1_group 6668,group_1_group 667,group_1_group 6705,group_1_group 6757,group_1_group 678,group_1_group 6826,group_1_group 6836,group_1_group 7011,group_1_group 7124,group_1_group 7256,group_1_group 7331,group_1_group 7350,group_1_group 7518,group_1_group 7636,group_1_group 768,group_1_group 7737,group_1_group 7743,group_1_group 7936,group_1_group 8006,group_1_group 8541,group_1_group 858,group_1_group 8590,group_1_group 8688,group_1_group 8921,group_1_group 9104,group_1_group 9107,group_1_group 9280,group_1_group 9405,group_1_group 9439,group_1_group 9603,group_1_group 9726,group_1_group 9728
0,1,0,0,1,1,0,1,0,0,0,0,1,0,0,0,0,0,1,1,0,1,1,0,0,1,1,1,0,0.36,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
1,1,0,0,1,1,0,1,0,0,0,0,1,0,0,0,0,0,1,1,0,1,1,0,0,1,1,1,0,0.36,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
2,1,0,0,1,1,0,1,0,0,0,0,1,0,0,0,0,0,1,1,0,1,1,0,0,1,1,1,0,0.36,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
3,1,0,0,1,1,0,1,0,0,0,0,1,0,0,0,0,0,1,1,0,1,1,0,0,1,1,1,0,0.36,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
4,1,0,0,1,1,0,1,0,0,0,0,1,0,0,0,0,0,1,1,0,1,1,0,0,1,1,1,0,0.36,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0


In [None]:
# Refit the pipeline on the full training set and predict the test set.
y_pred_train, y_pred_test = predict(pipe_lr, X_train, y_train, X_test)
# Export of the test predictions is currently disabled:
#output = pd.DataFrame({"predict": y_pred_test})
#output.to_csv("performance/prediction.csv", header=False, index=False)