In [133]:
import pandas as pd
import numpy as np
import os
from sklearn import neural_network
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import plot_tree, DecisionTreeClassifier, export_graphviz
from sklearn.metrics import accuracy_score

from datetime import datetime



def to_dummies_daily(data):
    data_dummies = pd.get_dummies(
        data, columns=["Spade card", "Diamond card", "Heart card", "Club card"]
    )
    new_data = pd.DataFrame()
    for index, row in data_dummies.iterrows():
        new_data.at[index, "Date"] = row["Date "]
        new_data.at[index, "weekday"] = row["Date "].weekday()
        new_data.at[index, "month"] = row["Date "].month
        new_data.at[index, "year"] = row["Date "].year
    for i in ["7", "8", "9", "10", "J", "Q", "K", "A"]:
        new_data[i] = (
            data_dummies["Spade card_{}".format(i)]
            | data_dummies["Diamond card_{}".format(i)]
            | data_dummies["Heart card_{}".format(i)]
            | data_dummies["Club card_{}".format(i)]
        )
    new_data.reset_index(drop=True)
    return new_data

def split(new_data):
    X, y, raw_y = split_X_Y_daily(new_data)
    raw_y = pd.DataFrame(raw_y)
    return X, y, raw_y


def split_X_Y_daily(data, frame=1):
    data = data.iloc[::-1].reset_index(drop=True)
    datalen = len(data)
    X = []
    y = []
    raw_y = []
    for index, row in data.iterrows():
        if index > datalen - frame - 1:
            break
        
        # the the prev day values
        Xi = []
        for i in range(0, frame):
            Xi.append(data.iloc[index + i, 1:].values.tolist())
        X.append(Xi)
        
        # get the next day label
        yi = data.iloc[index + frame, 4:].values.tolist()
        # if any of the next 7 days are the same date
        # compute OR between our label and each day label.
        for i in range(0, 7):
            if index + frame + i > datalen - 100:
                break
            if (
                data.loc[index + frame, ["Date"]].values
                == data.loc[index + frame + i, ["Date"]].values
            ):
                yi = np.logical_or(yi, data.iloc[index + frame + i, 4:].values.tolist())
        y.append(yi)
        
        
        # not sure what it's doing.
        np.append(
            data.loc[index + frame, ["weekday"]].values,
            data.loc[index + frame, ["month"]].values,
        )
        
        # current label (with full day label) and date at the end
        raw_yi = yi
        raw_yi = np.multiply(np.array(raw_yi), 1)
        raw_yi = np.append(raw_yi, data.loc[index + frame, ["Date"]].values)

        raw_y.append(raw_yi)
        
    y = np.multiply(np.array(y), 1)
    X = np.array(X)
    return X, y, raw_y


In [70]:
now = datetime.now()
data = pd.read_csv("/Users/serlich/Documents/Notebooks/Chance Prediction/Chance.csv", encoding = 'latin1').drop(columns=['Unnamed: 6'])
data = data.rename(columns={'òìä':'Club card','ìá':'Heart card','éäìåí':'Diamond card','úìúï':'Spade card','äâøìä':"ID",'úàøéê':"Date "})
data["Date "] = pd.to_datetime(data["Date "], dayfirst=True)

In [71]:
data_after_dummies = to_dummies_daily(data)
data_after_dummies.head()
# the encoding: where there is one if the card showed up at this round

Unnamed: 0,Date,weekday,month,year,7,8,9,10,J,Q,K,A
0,2022-07-12,1.0,7.0,2022.0,0,0,0,0,0,0,1,1
1,2022-07-12,1.0,7.0,2022.0,0,1,1,0,0,0,1,1
2,2022-07-12,1.0,7.0,2022.0,0,1,1,1,0,0,0,1
3,2022-07-12,1.0,7.0,2022.0,1,0,1,0,1,0,0,0
4,2022-07-12,1.0,7.0,2022.0,0,1,0,0,0,1,0,1


In [72]:
X, y, raw_y = split(data_after_dummies)
# y's are the encoding
# X's are the day before, all the values exculde the date

In [76]:
assert X.shape[0] == y.shape[0]
assert raw_y.shape[0] == y.shape[0]

In [15]:
# Corr = pd.DataFrame(X)
# print("correlation matrix: ")
# corr_matrix = Corr.corr()
# print(corr_matrix)

In [118]:
from unittest import result
from keras.models import Sequential
from keras.layers import Dense, Activation
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import sklearn
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.ensemble import RandomForestRegressor


def random_forest(X, y, X_test, y_test):
    print(f"X: {X.shape}")
    print(f"Y: {y.shape}")
    model = RandomForestRegressor(n_estimators=1000)
    print("--------training random forest-----------")
    model.fit(X, y)
    y_pre = model.predict(X)
    y_pre_round = np.round(y_pre)
    train_accuracy = sklearn.metrics.accuracy_score(y_pre_round, y)
    print("trainning accuracy: ", train_accuracy)
    yhat = model.predict(X_test)
    yhat_round = np.round(yhat)
    test_accuracy = sklearn.metrics.accuracy_score(yhat_round, y_test)
    print("test accuracy: ", test_accuracy)

    return yhat

def transform_to_2_highest_value(data, raw_result):
    columns = list(raw_result.columns)
    result = []
    for index, row in raw_result.iterrows():
        highest = 0
        second = 0
        new_row = []
        highest_card = 0
        second_card = 0
        is_highest_card = "failed to predict"
        is_second_card = "failed to predict"
        for column in columns:
            if column == "Date":
                new_row.append(row[column])
            elif highest < row[column]:
                highest = row[column]
                highest_card = column
        for column in columns:
            if column == "Date":
                continue
            elif row[column] < highest and row[column] >= second:
                second = row[column]
                second_card = column
        data_row = data.loc[index]
        if data_row[highest_card] == 1:
            is_highest_card = "predicted correctly"
        if data_row[second_card] == 1:
            is_second_card = "predicted correctly"
        new_row.extend(
            [
                highest_card,
                highest * 100,
                is_highest_card,
                second_card,
                second * 100,
                is_second_card,
            ]
        )
        result.append(new_row)
    result = pd.DataFrame(
        result,
        columns=[
            "Date",
            "Highest_card",
            "Highest_percentage",
            "Is_highest_card",
            "Second_highest_card",
            "Second_Percentage",
            "is_second_card",
        ],
    )

    return result

def train_daily(X, y):
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.2, shuffle=False
    )
    result = pd.DataFrame()
    print("many train: ",y_train.shape[1])
    for i in range(0, y_train.shape[1]):
        result[i] = random_forest(X_train, y_train[:, i], X_test, y_test[:, i])
    return result,y_train.shape[0]


def format_result(result,raw_y):
    result.columns = ["7", "8", "9", "10", "J", "Q", "K", "A"]
    raw_y.columns = ["7", "8", "9", "10", "J", "Q", "K", "A", "Date"]
    result["Date"] = raw_y['Date']
    highest_value_result = transform_to_2_highest_value(raw_y, result)
    return highest_value_result

In [110]:
X_shaped = X.reshape(X.shape[0], X.shape[2])

In [125]:
results,train_size = train_daily(X_shaped, y)

many train:  8
X: (34121, 11)
Y: (34121,)
--------training random forest-----------
trainning accuracy:  0.9749421177573928
test accuracy:  0.7231274176532646
X: (34121, 11)
Y: (34121,)
--------training random forest-----------
trainning accuracy:  0.9752938073327276
test accuracy:  0.6618215918415191
X: (34121, 11)
Y: (34121,)
--------training random forest-----------
trainning accuracy:  0.9756161894434513
test accuracy:  0.7566522095885594
X: (34121, 11)
Y: (34121,)
--------training random forest-----------
trainning accuracy:  0.9759385715541748
test accuracy:  0.7180869769077483
X: (34121, 11)
Y: (34121,)
--------training random forest-----------
trainning accuracy:  0.9748541953635591
test accuracy:  0.7193763919821826
X: (34121, 11)
Y: (34121,)
--------training random forest-----------
trainning accuracy:  0.9758799566249524
test accuracy:  0.729691712577658
X: (34121, 11)
Y: (34121,)
--------training random forest-----------
trainning accuracy:  0.9753524222619501
test accuracy

In [126]:
results.head()

Unnamed: 0,0,1,2,3,4,5,6,7
0,0.573367,0.584,0.765,0.7036,0.524383,0.249,0.359917,0.819667
1,0.535533,0.684667,0.77025,0.9486,0.425167,0.639,0.63075,0.733833
2,0.878333,0.863833,0.923,0.939,0.8019,0.809667,0.7825,0.966
3,0.94425,0.629667,0.9655,0.865833,0.821967,0.678833,0.5724,0.883417
4,0.624533,0.599,0.810167,0.5689,0.743417,0.493643,0.64675,0.688083


In [124]:
raw_y.loc[train_size:, :]

Unnamed: 0,7,8,9,10,J,Q,K,A,Date
34121,1,1,1,1,1,1,1,1,2018-02-13
34122,0,0,1,1,1,1,1,1,2018-02-13
34123,0,0,0,1,0,0,1,1,2018-02-13
34124,1,1,1,1,1,1,1,1,2018-02-14
34125,1,1,1,1,1,1,1,1,2018-02-14
...,...,...,...,...,...,...,...,...,...
42647,0,1,0,0,0,1,0,1,2022-07-12
42648,1,0,1,0,1,0,0,0,2022-07-12
42649,0,1,1,1,0,0,0,1,2022-07-12
42650,0,1,1,0,0,0,1,1,2022-07-12


In [131]:
final_result = format_result(results.copy(),raw_y.loc[train_size:, :].reset_index(drop=True))
final_result.head()

Unnamed: 0,Date,Highest_card,Highest_percentage,Is_highest_card,Second_highest_card,Second_Percentage,is_second_card
0,2018-02-13,A,81.966667,predicted correctly,9,76.5,predicted correctly
1,2018-02-13,10,94.86,predicted correctly,9,77.025,predicted correctly
2,2018-02-13,A,96.6,predicted correctly,10,93.9,predicted correctly
3,2018-02-14,9,96.55,predicted correctly,7,94.425,predicted correctly
4,2018-02-14,9,81.016667,predicted correctly,J,74.341667,predicted correctly


In [132]:
print(
    "result for predict highest value is: ",
    final_result["Is_highest_card"].value_counts(normalize=True).mul(100).astype(str) + "%",
)
print(
    "result for predict second-highest value is: ",
    final_result["is_second_card"].value_counts(normalize=True).mul(100).astype(str) + "%",
)
print("running time: ", (datetime.now() - now))

result for predict highest value is:  predicted correctly    76.27476263040676%
failed to predict      23.72523736959325%
Name: Is_highest_card, dtype: object
result for predict second-highest value is:  predicted correctly    77.49384597350839%
failed to predict      22.50615402649162%
Name: is_second_card, dtype: object
running time:  1:58:09.321972
