In [19]:
from sklearn.metrics import make_scorer, cohen_kappa_score
from sklearn.model_selection import train_test_split
import pandas as pd
import imblearn

## upsample classes

In [30]:
reduced_train = pd.read_csv('reduce_train.csv')
reduced_train.accuracy_group.value_counts()

3    8845
0    4229
1    2411
2    2205
Name: accuracy_group, dtype: int64

In [16]:
cols_to_drop = ["game_session", "installation_id", "accuracy_group"]

In [17]:
X = reduced_train.drop(cols_to_drop, axis=1)
y = reduced_train.accuracy_group

In [31]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=.2, random_state=42)

In [29]:
# given an x_train and y_train, balanced the data


def balance_classes(X_train, y_train):
    """ Balance classes such that all accuracy groups are equally represented"""
    
    X_train['accuracy_group'] = y_train
    train_df = X_train
    
    from sklearn.utils import resample

    # Separate classes
    df_0 = train_df[train_df.accuracy_group == 0]
    df_1 = train_df[train_df.accuracy_group == 1]
    df_2 = train_df[train_df.accuracy_group == 2]
    df_3 = train_df[train_df.accuracy_group == 3]

    # Highest count to upsample towards
    biggest_class = max([x.shape[0] for x in [df_0, df_1, df_2, df_3]])
    resampled_dfs = []
    for i in [df_0, df_1, df_2, df_3]:
        if i.shape[0] != biggest_class:
            upsampled_df = resample(i,
                                      replace=True,  # sample without replacement
                                      n_samples=biggest_class,  # to match majority
                                      random_state=42)  # reproducibility
            
            resampled_dfs.append(upsampled_df)
        else:
            resampled_dfs.append(i) 
            

    balanced_train_df = pd.concat(resampled_dfs, axis=0)

    return balanced_train_df.drop('accuracy_group', axis=1), balanced_train_df.accuracy_group

In [32]:
X_train, y_train = balance_classes(X_train, y_train)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  import sys


In [33]:
y_train.value_counts()

3    7079
2    7079
1    7079
0    7079
Name: accuracy_group, dtype: int64