In [1]:
# Attach Google Drive to the Colab filesystem so the project files under it can be read and written
from google.colab import drive

drive_mount_point = '/content/drive'
drive.mount(drive_mount_point)

Mounted at /content/drive


In [2]:
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, accuracy_score

In [3]:
# Read the league match table from the mounted Drive folder into a DataFrame
file_path = '/content/drive/MyDrive/Colab Notebooks/DAT/프로젝트/data/league_5_div_HTR_X.csv'
league_df = pd.read_csv(filepath_or_buffer=file_path)

In [4]:
# Show the loaded match table as the cell output to inspect its columns and rows
league_df

Unnamed: 0,Date,HomeTeam,AwayTeam,HTOa,ATOa,HTAt,ATAt,HTMid,ATMid,HTDef,...,HR,AR,HxG,AxG,HxA,AxA,HxPTS,AxPTS,HPPDA,APPDA
0,2014-09-26,Elche,Celta Vigo,72.0,73.0,72.0,76.0,71.0,74.0,72.0,...,0.0,0.0,0.828,1.536,0.456,0.974,0.844,1.696,12.440,6.458
1,2014-09-27,Atletico Madrid,Sevilla,80.0,78.0,79.0,79.0,80.0,77.0,80.0,...,0.0,0.0,1.500,1.846,1.376,1.082,1.910,2.104,11.820,8.262
2,2014-09-27,Villarreal,Real Madrid,75.0,84.0,77.0,86.0,75.0,85.0,73.0,...,0.0,0.0,1.958,2.104,1.472,1.502,1.710,1.760,13.016,8.634
3,2014-09-27,Athletic Club,Eibar,78.0,71.0,77.0,73.0,79.0,70.0,78.0,...,0.0,0.0,1.002,0.686,0.788,0.552,1.080,1.074,7.822,9.050
4,2014-09-27,Barcelona,Granada,84.0,73.0,87.0,71.0,84.0,73.0,81.0,...,0.2,0.0,2.264,0.552,1.444,0.330,2.544,1.222,6.064,8.726
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
12543,2021-09-25,Union Berlin,Arminia Bielefeld,74.0,72.0,77.0,72.0,72.0,71.0,73.0,...,0.2,0.2,1.560,0.902,1.118,0.634,1.762,0.642,16.288,13.936
12544,2021-09-25,Bayer Leverkusen,Mainz 05,78.0,73.0,78.0,72.0,78.0,74.0,74.0,...,0.2,0.0,1.384,1.414,1.140,0.830,1.316,1.930,12.608,11.290
12545,2021-09-25,Hoffenheim,Wolfsburg,76.0,78.0,77.0,80.0,76.0,78.0,75.0,...,0.0,0.0,1.776,1.550,1.512,0.816,1.366,1.914,11.720,8.246
12546,2021-09-25,Eintracht Frankfurt,FC Cologne,77.0,73.0,76.0,74.0,77.0,74.0,76.0,...,0.0,0.2,1.248,1.738,1.018,1.398,1.384,1.376,7.686,7.152


In [6]:
# Build the modelling table: scale the numeric columns, encode the FTR target
# as integers, and persist the result so the modelling cells read it from CSV.

# Columns that identify a match; they are never scaled and are dropped before modelling.
id_columns = ['Date', 'HomeTeam', 'AwayTeam']

# Normalize the relevant columns (everything except the identifiers and the FTR target)
# NOTE(review): the scaler is fit on every row, i.e. before the train/test split
# performed later in the notebook, so held-out rows contribute to the mean/std.
columns_to_normalize = league_df.columns.difference(id_columns + ['FTR'])
scaler = StandardScaler()
normalized_data = scaler.fit_transform(league_df[columns_to_normalize])
# Reuse league_df's index so the column-wise concat pairs each scaled row with
# its own identifiers/target even when league_df has no default RangeIndex.
normalized_df = pd.DataFrame(
    normalized_data, columns=columns_to_normalize, index=league_df.index
)
normalized_df = pd.concat([league_df[id_columns + ['FTR']], normalized_df], axis=1)

# FTR already travels with normalized_df, so no merge is needed to re-attach it.
# ftr_df is taken as an explicit copy rather than a slice of league_df, which is
# what avoids pandas' SettingWithCopyWarning if it is modified afterwards.
# Both names are kept in case a later cell refers to them.
ftr_df = league_df[['FTR']].copy()
merged_df = normalized_df.copy()

# Clean the merged data
cleaned_df = merged_df.drop(columns=id_columns)
ftr_mapping = {'H': 0, 'A': 1, 'D': 2}
# Series.map converts any label missing from the mapping into NaN without
# complaint, so reject unexpected result codes before they reach the saved file.
unknown_labels = set(cleaned_df['FTR'].dropna().unique()) - set(ftr_mapping)
if unknown_labels:
    raise ValueError(
        f"Unexpected FTR labels: {sorted(map(str, unknown_labels))}"
    )
cleaned_df['FTR'] = cleaned_df['FTR'].map(ftr_mapping)

# Save the cleaned and processed DataFrame to a CSV file
output_file_path = '/content/drive/MyDrive/Colab Notebooks/DAT/프로젝트/data/cleaned_normalized_league_data.csv'
cleaned_df.to_csv(output_file_path, index=False)

# Load the cleaned data back from disk; the modelling cells use this frame
df = pd.read_csv(output_file_path)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  ftr_df['index'] = ftr_df.index


In [8]:
# List the columns of the reloaded cleaned table (the FTR target plus the scaled features)
df.columns

Index(['FTR', 'AC', 'AF', 'APPDA', 'AR', 'AS', 'AST', 'ATAt', 'ATDef', 'ATMid',
       'ATOa', 'AY', 'AwayAvgAge', 'AwayMV', 'AwaySquad', 'AxA', 'AxG',
       'AxPTS', 'FTAG', 'FTHG', 'HC', 'HF', 'HPPDA', 'HR', 'HS', 'HST', 'HTAG',
       'HTAt', 'HTDef', 'HTHG', 'HTMid', 'HTOa', 'HY', 'HomeAvgAge', 'HomeMV',
       'HomeSquad', 'HxA', 'HxG', 'HxPTS'],
      dtype='object')

In [9]:
# Define the features and the target variable
X = df.drop(columns=['FTR'])
y = df['FTR']

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

# Initialize the model
model = RandomForestClassifier(random_state=42)

# Train the model
model.fit(X_train, y_train)

# Make predictions on the test data
y_pred = model.predict(X_test)

# Evaluate the model
accuracy = accuracy_score(y_test, y_pred)
report = classification_report(y_test, y_pred)

print(f"Accuracy: {accuracy}")
print("Classification Report:")
print(report)


Accuracy: 0.5328021248339974
Classification Report:
              precision    recall  f1-score   support

           0       0.55      0.80      0.65      1666
           1       0.52      0.51      0.51      1140
           2       0.41      0.09      0.15       959

    accuracy                           0.53      3765
   macro avg       0.49      0.47      0.44      3765
weighted avg       0.51      0.53      0.48      3765

