In [216]:
import pandas as pd
import numpy as np
import os
from sklearn import neural_network
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import plot_tree, DecisionTreeClassifier, export_graphviz
from sklearn.metrics import accuracy_score

from datetime import datetime


def to_dummies_daily(data):
    data_dummies = pd.get_dummies(
        data, columns=["Spade card", "Diamond card", "Heart card", "Club card"]
    )
    new_data = pd.DataFrame()
    for index, row in data_dummies.iterrows():
        new_data.at[index, "Date"] = row["Date "]
        new_data.at[index, "weekday"] = row["Date "].weekday()
        new_data.at[index, "month"] = row["Date "].month
        new_data.at[index, "year"] = row["Date "].year
    for i in ["7", "8", "9", "10", "J", "Q", "K", "A"]:
        new_data[i] = (
            data_dummies["Spade card_{}".format(i)]
            | data_dummies["Diamond card_{}".format(i)]
            | data_dummies["Heart card_{}".format(i)]
            | data_dummies["Club card_{}".format(i)]
        )
    new_data.reset_index(drop=True)
    return new_data

def split(new_data):
    X, y, raw_y = split_X_Y_daily(new_data)
    raw_y = pd.DataFrame(raw_y)
    return X, y, raw_y


def split_X_Y_daily(data, frame=1):
    data = data.iloc[::-1].reset_index(drop=True)
    datalen = len(data)
    X = []
    y = []
    raw_y = []
    for index, row in data.iterrows():
        if index > datalen - frame - 1:
            break
        
        # the the prev day values
        Xi = []
        for i in range(0, frame):
            Xi.append(data.iloc[index + i, 1:].values.tolist())
        X.append(Xi)
        
        # get the next day label
        yi = data.iloc[index + frame, 4:].values.tolist()
        # if any of the next 7 days are the same date
        # compute OR between our label and each day label.
        for i in range(0, 7):
            if index + frame + i > datalen - 100:
                break
            if (
                data.loc[index + frame, ["Date"]].values
                == data.loc[index + frame + i, ["Date"]].values
            ):
                yi = np.logical_or(yi, data.iloc[index + frame + i, 4:].values.tolist())
        y.append(yi)
        
        
        # not sure what it's doing.
        np.append(
            data.loc[index + frame, ["weekday"]].values,
            data.loc[index + frame, ["month"]].values,
        )
        
        # current label (with full day label) and date at the end
        raw_yi = yi
        raw_yi = np.multiply(np.array(raw_yi), 1)
        raw_yi = np.append(raw_yi, data.loc[index + frame, ["Date"]].values)

        raw_y.append(raw_yi)
        
    y = np.multiply(np.array(y), 1)
    X = np.array(X)
    return X, y, raw_y


In [217]:
test_data = pd.read_csv("test_chance.csv",index_col=[0]).rename(columns={'Clubs':'Club card','Hearts':'Heart card','Diamonds':'Diamond card','Spades':'Spade card','lottoryNumber':"ID",'Date':"Date "})
test_data['Date '] = pd.to_datetime(test_data['Date ']).dt.date

train_data = pd.read_csv('train_chance.csv',index_col=[0]).rename(columns={'Clubs':'Club card','Hearts':'Heart card','Diamonds':'Diamond card','Spades':'Spade card','lottoryNumber':"ID",'Date':"Date "})
train_data['Date '] = pd.to_datetime(train_data['Date ']).dt.date

train_data.head()

Unnamed: 0,Date,ID,Spade card,Diamond card,Heart card,Club card
0,2001-01-01,6820,A,10,8,Q
1,2001-01-01,6821,10,10,7,9
2,2001-01-01,6822,J,10,J,10
3,2001-01-01,6823,J,K,Q,A
4,2001-02-01,6824,K,10,9,K


In [221]:
train_data_after_dummies = to_dummies_daily(train_data)
train_data_after_dummies.head()
# the encoding: where there is one if the card showed up at this round

Unnamed: 0,Date,weekday,month,year,7,8,9,10,J,Q,K,A
0,2001-01-01,0.0,1.0,2001.0,0,1,0,1,0,1,0,1
1,2001-01-01,0.0,1.0,2001.0,1,0,1,1,0,0,0,0
2,2001-01-01,0.0,1.0,2001.0,0,0,0,1,1,0,0,0
3,2001-01-01,0.0,1.0,2001.0,0,0,0,0,1,1,1,1
4,2001-02-01,3.0,2.0,2001.0,0,0,1,1,0,0,1,0


In [220]:
test_data_after_dummies = to_dummies_daily(test_data)
test_data_after_dummies.head()

Unnamed: 0,Date,weekday,month,year,7,8,9,10,J,Q,K,A
29124,2017-07-16,6.0,7.0,2017.0,1,0,0,0,1,1,0,0
29125,2017-07-16,6.0,7.0,2017.0,0,0,1,0,0,1,1,0
29126,2017-07-16,6.0,7.0,2017.0,0,0,1,0,0,1,1,0
29127,2017-07-16,6.0,7.0,2017.0,0,1,1,1,0,1,0,0
29128,2017-07-17,0.0,7.0,2017.0,0,1,0,0,0,1,1,0


In [222]:
X_train, y_train, _ = split(train_data_after_dummies)
X_test, y_test, _ = split(test_data_after_dummies)
# y's are the encoding
# X's are the day before, all the values exculde the date

In [88]:
#assert X.shape[0] == y.shape[0]
#assert raw_y.shape[0] == y.shape[0]

In [21]:
# Corr = pd.DataFrame(X)
# print("correlation matrix: ")
# corr_matrix = Corr.corr()
# print(corr_matrix)

In [233]:
from unittest import result
from keras.models import Sequential
from keras.layers import Dense, Activation
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import sklearn
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.ensemble import RandomForestRegressor

class random_forest:
    
    
    def fit(self,X, y):
        self.models = list()
        self.train_size = y_train.shape[0]
        self.num_of_model = y.shape[1]
        print("many train: ",self.num_of_model)
        for i in range(0, y.shape[1]):
            self.models.append(self.fit_one_model(X, y[:, i]))
        return self

    def fit_one_model(self,X, y):
        print("--------training random forest-----------")
        print(f"X: {X.shape}")
        print(f"Y: {y.shape}")
        model = RandomForestRegressor(n_estimators=1000)
        
        model.fit(X, y)
        y_pre = model.predict(X)
        y_pre_round = np.round(y_pre)
        train_accuracy = sklearn.metrics.accuracy_score(y_pre_round, y)
        print("trainning accuracy: ", train_accuracy)
        print("-----------------------------------------")

        return model

    def predict(self,X):
        results = pd.DataFrame()
        for i in range(0, self.num_of_model):
            results[i] = self.models[i].predict(X)
        
        results.columns = ["7", "8", "9", "10", "J", "Q", "K", "A"][:self.num_of_model]
        return results
        
#     def predict_accuracy(i,X_test):
#         yhat = 
#         yhat_round = np.round(yhat)
#         test_accuracy = sklearn.metrics.accuracy_score(yhat_round, y_test)
#         print("test accuracy: ", test_accuracy)
#         return yhat


# def transform_to_2_highest_value(data, raw_result):
#     columns = list(raw_result.columns)
#     result = []
#     for index, row in raw_result.iterrows():
#         highest = 0
#         second = 0
#         new_row = []
#         highest_card = 0
#         second_card = 0
#         is_highest_card = "failed to predict"
#         is_second_card = "failed to predict"
#         for column in columns:
#             if column == "Date":
#                 new_row.append(row[column])
#             elif highest < row[column]:
#                 highest = row[column]
#                 highest_card = column
#         for column in columns:
#             if column == "Date":
#                 continue
#             elif row[column] < highest and row[column] >= second:
#                 second = row[column]
#                 second_card = column
#         data_row = data.loc[index]
#         if data_row[highest_card] == 1:
#             is_highest_card = "predicted correctly"
#         if data_row[second_card] == 1:
#             is_second_card = "predicted correctly"
#         new_row.extend(
#             [
#                 highest_card,
#                 highest * 100,
#                 is_highest_card,
#                 second_card,
#                 second * 100,
#                 is_second_card,
#             ]
#         )
#         result.append(new_row)
#     result = pd.DataFrame(
#         result,
#         columns=[
#             "Date",
#             "Highest_card",
#             "Highest_percentage",
#             "Is_highest_card",
#             "Second_highest_card",
#             "Second_Percentage",
#             "is_second_card",
#         ],
#     )

#     return result


# def format_result(result,raw_y):
#     result.columns = ["7", "8", "9", "10", "J", "Q", "K", "A"]
#     raw_y.columns = ["7", "8", "9", "10", "J", "Q", "K", "A", "Date"]
#     result["Date"] = raw_y['Date']
#     highest_value_result = transform_to_2_highest_value(raw_y, result)
#     return highest_value_result

In [234]:
X_train_shaped = X_train.reshape(X_train.shape[0], X_train.shape[2])
X_test_shaped = X_test.reshape(X_test.shape[0], X_test.shape[2])

In [235]:
model = random_forest()
model.fit(X_train_shaped, y_train)

many train:  8
--------training random forest-----------
X: (29123, 11)
Y: (29123,)
trainning accuracy:  0.9754489578683515
-----------------------------------------
--------training random forest-----------
X: (29123, 11)
Y: (29123,)
trainning accuracy:  0.9754489578683515
-----------------------------------------
--------training random forest-----------
X: (29123, 11)
Y: (29123,)
trainning accuracy:  0.9744188442124782
-----------------------------------------
--------training random forest-----------
X: (29123, 11)
Y: (29123,)
trainning accuracy:  0.9756206434776638
-----------------------------------------
--------training random forest-----------
X: (29123, 11)
Y: (29123,)
trainning accuracy:  0.9763073859149126
-----------------------------------------
--------training random forest-----------
X: (29123, 11)
Y: (29123,)
trainning accuracy:  0.9770971397177488
-----------------------------------------
--------training random forest-----------
X: (29123, 11)
Y: (29123,)
trainning 

<__main__.random_forest at 0x7fab3d384f70>

In [236]:
results = model.predict(X_test_shaped)

In [237]:
results.head()

Unnamed: 0,7,8,9,10,J,Q,K,A
0,0.9266,0.8845,0.75465,0.788,0.5098,0.703333,0.722117,0.456
1,0.819417,0.650167,0.562452,0.712,0.6325,0.719333,0.784833,0.503
2,0.815,0.583,0.735,0.3845,0.475,0.8,0.386,0.499
3,0.87,0.823,0.7785,0.738,0.5145,0.30725,0.610667,0.847
4,0.849,0.6015,0.862,0.873667,0.739333,0.886,0.557,0.83


In [238]:
compute_statistics_bins(results.idxmax(axis=1),test_data['Spade card'],4000)


[{'name': UUID('38743a3d-255e-432c-8c26-a73e3b0f3995'),
  'invested': '£400,000.00',
  'single_bet_amount': '£100.00',
  'total_invested': '£400,000.00',
  'total_won': 508,
  'precentage_won': 12.7,
  'expected_won': 12.5,
  'eraned': '£254,000.00',
  'revenue': '-£146,000.00'},
 {'name': UUID('535b8f49-7e8d-412c-a88c-aa0e3ce06c68'),
  'invested': '£400,000.00',
  'single_bet_amount': '£100.00',
  'total_invested': '£400,000.00',
  'total_won': 508,
  'precentage_won': 12.7,
  'expected_won': 12.5,
  'eraned': '£254,000.00',
  'revenue': '-£146,000.00'}]

In [243]:
from chance_utils import compute_statistics

compute_statistics(results.idxmax(axis=1),test_data['Spade card'],name="Spade")


{'name': 'Spade',
 'invested': '£970,800.00',
 'single_bet_amount': '£100.00',
 'total_invested': '£970,800.00',
 'total_won': 1231,
 'precentage_won': 12.680263700041202,
 'expected_won': 12.5,
 'eraned': '£615,500.00',
 'revenue': '-£355,300.00'}

In [244]:
compute_statistics(results.idxmax(axis=1),test_data['Diamond card'],name="Diamond")

{'name': 'Diamond',
 'invested': '£970,800.00',
 'single_bet_amount': '£100.00',
 'total_invested': '£970,800.00',
 'total_won': 1207,
 'precentage_won': 12.433044911413267,
 'expected_won': 12.5,
 'eraned': '£603,500.00',
 'revenue': '-£367,300.00'}

In [245]:
compute_statistics(results.idxmax(axis=1),test_data['Heart card'],name="Heart")

{'name': 'Heart',
 'invested': '£970,800.00',
 'single_bet_amount': '£100.00',
 'total_invested': '£970,800.00',
 'total_won': 1225,
 'precentage_won': 12.618459002884219,
 'expected_won': 12.5,
 'eraned': '£612,500.00',
 'revenue': '-£358,300.00'}

In [246]:
compute_statistics(results.idxmax(axis=1),test_data['Club card'],name="Club")

{'name': 'Club',
 'invested': '£970,800.00',
 'single_bet_amount': '£100.00',
 'total_invested': '£970,800.00',
 'total_won': 1191,
 'precentage_won': 12.26823238566131,
 'expected_won': 12.5,
 'eraned': '£595,500.00',
 'revenue': '-£375,300.00'}