## K-Value of 22-23

In [92]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split

# Set the range of K values to test
k_values = range(10, 51)

# Read the data from the CSV file
data = pd.read_csv("/Users/ryansung/Downloads/restructured_2223.csv")

# Split the data into training and testing data
train_ratio = 0.8  # Ratio of training data to the total data

# Randomly split the data into training and testing sets
training_data, testing_data = train_test_split(data, train_size=train_ratio, random_state=42)


# Function to calculate Elo ratings
def calculate_elo_ratings(data, k):
    # Initialize Elo ratings for each team
    elo_ratings = {}

    # Iterate over the data to calculate Elo ratings
    for _, match in data.iterrows():
        team_a = match['TeamA']
        team_b = match['TeamB']
        score_a = match['ScoreA']
        score_b = match['ScoreB']

        # Calculate expected probabilities using Elo ratings
        expected_a = 1 / (1 + 10 ** ((elo_ratings.get(team_b, 1500) - elo_ratings.get(team_a, 1500)) / 400))
        expected_b = 1 - expected_a

        # Update Elo ratings based on match outcome
        elo_ratings[team_a] = elo_ratings.get(team_a, 1500) + k * (score_a - expected_a)
        elo_ratings[team_b] = elo_ratings.get(team_b, 1500) + k * (score_b - expected_b)

    return elo_ratings

# Function to simulate matches and calculate accuracy
def simulate_matches(data, elo_ratings):
    total_matches = len(data)
    correct_predictions = 0

    # Iterate over the data to simulate matches and compare with actual outcomes
    for _, match in data.iterrows():
        team_a = match['TeamA']
        team_b = match['TeamB']
        score_a = match['ScoreA']
        score_b = match['ScoreB']

        # Simulate match outcome based on Elo ratings
        simulated_score_a = np.random.poisson(elo_ratings.get(team_a, 1500))
        simulated_score_b = np.random.poisson(elo_ratings.get(team_b, 1500))

        # Check if the simulated outcome matches the actual outcome
        if simulated_score_a > simulated_score_b and score_a > score_b:
            correct_predictions += 1
        elif simulated_score_a < simulated_score_b and score_a < score_b:
            correct_predictions += 1

    # Calculate accuracy rate
    accuracy = correct_predictions / total_matches
    return accuracy

# Loop over each K value and calculate accuracy
best_k = None
best_accuracy = 0

for k in k_values:
    # Calculate Elo ratings using the training data
    elo_ratings = calculate_elo_ratings(training_data, k)

    # Simulate matches and calculate accuracy using the testing data
    accuracy = simulate_matches(testing_data, elo_ratings)

    # Update best K value if accuracy is improved
    if accuracy > best_accuracy:
        best_k = k
        best_accuracy = accuracy

# Print the best K value and its corresponding accuracy
print(f"Best K value: {best_k}")
print(f"Accuracy: {best_accuracy}")


Best K value: 17
Accuracy: 0.5612244897959183


## K-Value of 21-22

In [91]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split

# Set the range of K values to test
k_values = range(10, 51)

# Read the data from the CSV file
data = pd.read_csv("/Users/ryansung/Downloads/restructured_2122.csv")

# Split the data into training and testing data
train_ratio = 0.8  # Ratio of training data to the total data

# Randomly split the data into training and testing sets
training_data, testing_data = train_test_split(data, train_size=train_ratio, random_state=42)


# Function to calculate Elo ratings
def calculate_elo_ratings(data, k):
    # Initialize Elo ratings for each team
    elo_ratings = {}

    # Iterate over the data to calculate Elo ratings
    for _, match in data.iterrows():
        team_a = match['TeamA']
        team_b = match['TeamB']
        score_a = match['ScoreA']
        score_b = match['ScoreB']

        # Calculate expected probabilities using Elo ratings
        expected_a = 1 / (1 + 10 ** ((elo_ratings.get(team_b, 1500) - elo_ratings.get(team_a, 1500)) / 400))
        expected_b = 1 - expected_a

        # Update Elo ratings based on match outcome
        elo_ratings[team_a] = elo_ratings.get(team_a, 1500) + k * (score_a - expected_a)
        elo_ratings[team_b] = elo_ratings.get(team_b, 1500) + k * (score_b - expected_b)

    return elo_ratings

# Function to simulate matches and calculate accuracy
def simulate_matches(data, elo_ratings):
    total_matches = len(data)
    correct_predictions = 0

    # Iterate over the data to simulate matches and compare with actual outcomes
    for _, match in data.iterrows():
        team_a = match['TeamA']
        team_b = match['TeamB']
        score_a = match['ScoreA']
        score_b = match['ScoreB']

        # Simulate match outcome based on Elo ratings
        simulated_score_a = np.random.poisson(elo_ratings.get(team_a, 1500))
        simulated_score_b = np.random.poisson(elo_ratings.get(team_b, 1500))

        # Check if the simulated outcome matches the actual outcome
        if simulated_score_a > simulated_score_b and score_a > score_b:
            correct_predictions += 1
        elif simulated_score_a < simulated_score_b and score_a < score_b:
            correct_predictions += 1

    # Calculate accuracy rate
    accuracy = correct_predictions / total_matches
    return accuracy

# Loop over each K value and calculate accuracy
best_k = None
best_accuracy = 0

for k in k_values:
    # Calculate Elo ratings using the training data
    elo_ratings = calculate_elo_ratings(training_data, k)

    # Simulate matches and calculate accuracy using the testing data
    accuracy = simulate_matches(testing_data, elo_ratings)

    # Update best K value if accuracy is improved
    if accuracy > best_accuracy:
        best_k = k
        best_accuracy = accuracy

# Print the best K value and its corresponding accuracy
print(f"Best K value: {best_k}")
print(f"Accuracy: {best_accuracy}")


Best K value: 17
Accuracy: 0.5833333333333334


## K-Value of 20-21

In [108]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split

# Set the range of K values to test
k_values = range(10, 51)

# Read the data from the CSV file
data = pd.read_csv("/Users/ryansung/Downloads/restructured_2021.csv")

# Split the data into training and testing data
train_ratio = 0.8  # Ratio of training data to the total data

# Randomly split the data into training and testing sets
training_data, testing_data = train_test_split(data, train_size=train_ratio, random_state=42)


# Function to calculate Elo ratings
def calculate_elo_ratings(data, k):
    # Initialize Elo ratings for each team
    elo_ratings = {}

    # Iterate over the data to calculate Elo ratings
    for _, match in data.iterrows():
        team_a = match['TeamA']
        team_b = match['TeamB']
        score_a = match['ScoreA']
        score_b = match['ScoreB']

        # Calculate expected probabilities using Elo ratings
        expected_a = 1 / (1 + 10 ** ((elo_ratings.get(team_b, 1500) - elo_ratings.get(team_a, 1500)) / 400))
        expected_b = 1 - expected_a

        # Update Elo ratings based on match outcome
        elo_ratings[team_a] = elo_ratings.get(team_a, 1500) + k * (score_a - expected_a)
        elo_ratings[team_b] = elo_ratings.get(team_b, 1500) + k * (score_b - expected_b)

    return elo_ratings

# Function to simulate matches and calculate accuracy
def simulate_matches(data, elo_ratings):
    total_matches = len(data)
    correct_predictions = 0

    # Iterate over the data to simulate matches and compare with actual outcomes
    for _, match in data.iterrows():
        team_a = match['TeamA']
        team_b = match['TeamB']
        score_a = match['ScoreA']
        score_b = match['ScoreB']

        # Simulate match outcome based on Elo ratings
        simulated_score_a = np.random.poisson(elo_ratings.get(team_a, 1500))
        simulated_score_b = np.random.poisson(elo_ratings.get(team_b, 1500))

        # Check if the simulated outcome matches the actual outcome
        if simulated_score_a > simulated_score_b and score_a > score_b:
            correct_predictions += 1
        elif simulated_score_a < simulated_score_b and score_a < score_b:
            correct_predictions += 1

    # Calculate accuracy rate
    accuracy = correct_predictions / total_matches
    return accuracy

# Loop over each K value and calculate accuracy
best_k = None
best_accuracy = 0

for k in k_values:
    # Calculate Elo ratings using the training data
    elo_ratings = calculate_elo_ratings(training_data, k)

    # Simulate matches and calculate accuracy using the testing data
    accuracy = simulate_matches(testing_data, elo_ratings)

    # Update best K value if accuracy is improved
    if accuracy > best_accuracy:
        best_k = k
        best_accuracy = accuracy

# Print the best K value and its corresponding accuracy
print(f"Best K value: {best_k}")
print(f"Accuracy: {best_accuracy}")


Best K value: 12
Accuracy: 0.5747126436781609


## K-value of 19-20

In [115]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split

# Set the range of K values to test
k_values = range(10, 51)

# Read the data from the CSV file
data = pd.read_csv("/Users/ryansung/Downloads/restructured_1920.csv")

# Split the data into training and testing data
train_ratio = 0.8  # Ratio of training data to the total data

# Randomly split the data into training and testing sets
training_data, testing_data = train_test_split(data, train_size=train_ratio, random_state=42)


# Function to calculate Elo ratings
def calculate_elo_ratings(data, k):
    # Initialize Elo ratings for each team
    elo_ratings = {}

    # Iterate over the data to calculate Elo ratings
    for _, match in data.iterrows():
        team_a = match['TeamA']
        team_b = match['TeamB']
        score_a = match['ScoreA']
        score_b = match['ScoreB']

        # Calculate expected probabilities using Elo ratings
        expected_a = 1 / (1 + 10 ** ((elo_ratings.get(team_b, 1500) - elo_ratings.get(team_a, 1500)) / 400))
        expected_b = 1 - expected_a

        # Update Elo ratings based on match outcome
        elo_ratings[team_a] = elo_ratings.get(team_a, 1500) + k * (score_a - expected_a)
        elo_ratings[team_b] = elo_ratings.get(team_b, 1500) + k * (score_b - expected_b)

    return elo_ratings

# Function to simulate matches and calculate accuracy
def simulate_matches(data, elo_ratings):
    total_matches = len(data)
    correct_predictions = 0

    # Iterate over the data to simulate matches and compare with actual outcomes
    for _, match in data.iterrows():
        team_a = match['TeamA']
        team_b = match['TeamB']
        score_a = match['ScoreA']
        score_b = match['ScoreB']

        # Simulate match outcome based on Elo ratings
        simulated_score_a = np.random.poisson(elo_ratings.get(team_a, 1500))
        simulated_score_b = np.random.poisson(elo_ratings.get(team_b, 1500))

        # Check if the simulated outcome matches the actual outcome
        if simulated_score_a > simulated_score_b and score_a > score_b:
            correct_predictions += 1
        elif simulated_score_a < simulated_score_b and score_a < score_b:
            correct_predictions += 1

    # Calculate accuracy rate
    accuracy = correct_predictions / total_matches
    return accuracy

# Loop over each K value and calculate accuracy
best_k = None
best_accuracy = 0

for k in k_values:
    # Calculate Elo ratings using the training data
    elo_ratings = calculate_elo_ratings(training_data, k)

    # Simulate matches and calculate accuracy using the testing data
    accuracy = simulate_matches(testing_data, elo_ratings)

    # Update best K value if accuracy is improved
    if accuracy > best_accuracy:
        best_k = k
        best_accuracy = accuracy

# Print the best K value and its corresponding accuracy
print(f"Best K value: {best_k}")
print(f"Accuracy: {best_accuracy}")


Best K value: 23
Accuracy: 0.6578947368421053


## K-value of 18-19 

In [198]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split

# Set the range of K values to test
k_values = range(10, 51)

# Read the data from the CSV file
data = pd.read_csv("/Users/ryansung/Downloads/restructured_1819.csv")

# Split the data into training and testing data
train_ratio = 0.8  # Ratio of training data to the total data

# Randomly split the data into training and testing sets
training_data, testing_data = train_test_split(data, train_size=train_ratio, random_state=42)


# Function to calculate Elo ratings
def calculate_elo_ratings(data, k):
    # Initialize Elo ratings for each team
    elo_ratings = {}

    # Iterate over the data to calculate Elo ratings
    for _, match in data.iterrows():
        team_a = match['TeamA']
        team_b = match['TeamB']
        score_a = match['ScoreA']
        score_b = match['ScoreB']

        # Calculate expected probabilities using Elo ratings
        expected_a = 1 / (1 + 10 ** ((elo_ratings.get(team_b, 1500) - elo_ratings.get(team_a, 1500)) / 400))
        expected_b = 1 - expected_a

        # Update Elo ratings based on match outcome
        elo_ratings[team_a] = elo_ratings.get(team_a, 1500) + k * (score_a - expected_a)
        elo_ratings[team_b] = elo_ratings.get(team_b, 1500) + k * (score_b - expected_b)

    return elo_ratings

# Function to simulate matches and calculate accuracy
def simulate_matches(data, elo_ratings):
    total_matches = len(data)
    correct_predictions = 0

    # Iterate over the data to simulate matches and compare with actual outcomes
    for _, match in data.iterrows():
        team_a = match['TeamA']
        team_b = match['TeamB']
        score_a = match['ScoreA']
        score_b = match['ScoreB']

        # Simulate match outcome based on Elo ratings
        simulated_score_a = np.random.poisson(elo_ratings.get(team_a, 1500))
        simulated_score_b = np.random.poisson(elo_ratings.get(team_b, 1500))

        # Check if the simulated outcome matches the actual outcome
        if simulated_score_a > simulated_score_b and score_a > score_b:
            correct_predictions += 1
        elif simulated_score_a < simulated_score_b and score_a < score_b:
            correct_predictions += 1

    # Calculate accuracy rate
    accuracy = correct_predictions / total_matches
    return accuracy

# Loop over each K value and calculate accuracy
best_k = None
best_accuracy = 0

for k in k_values:
    # Calculate Elo ratings using the training data
    elo_ratings = calculate_elo_ratings(training_data, k)

    # Simulate matches and calculate accuracy using the testing data
    accuracy = simulate_matches(testing_data, elo_ratings)

    # Update best K value if accuracy is improved
    if accuracy > best_accuracy:
        best_k = k
        best_accuracy = accuracy

# Print the best K value and its corresponding accuracy
print(f"Best K value: {best_k}")
print(f"Accuracy: {best_accuracy}")


Best K value: 12
Accuracy: 0.5988023952095808


## Converting 2122 to excel for erasing Korean

In [85]:
# Read the original CSV file
data = pd.read_csv("/Users/ryansung/Downloads/vleague_2122.csv", encoding='cp949')

# Specify the output path for the XLSX file
output_path = "/Users/ryansung/Downloads/vleague_2122.xlsx"

# Convert and save the data to XLSX
data.to_excel(output_path, index=False)

### Reconsctructing 2122 data

In [97]:
import pandas as pd

# Read the original CSV file
data = pd.read_csv("/Users/ryansung/Downloads/vleague_2122.csv")

# Print the column names in the original data
print("Column names:")
print(data.columns)

# Create a new DataFrame for restructured data
restructured_data = pd.DataFrame(columns=['TeamA', 'TeamB', 'ScoreA', 'ScoreB'])

# Adjust the column names in the code based on the printed output

# Iterate over each row in the original data
for index, row in data.iterrows():
    # Check if the desired column names exist in the row
    if 'TeamA' in row and 'TeamB' in row:
        team_a = row['TeamA']
        team_b = row['TeamB']
        
        # Iterate over set columns (ASet1, ASet2, ..., ASet5)
        for i in range(1, 6):
            set_column_a = f"ASet{i}"
            set_column_b = f"BSet{i}"
            
            # Check if the set columns exist in the row
            if set_column_a in row and set_column_b in row:
                score_a = row[set_column_a]
                score_b = row[set_column_b]
                
                # Check if both set scores are not NaN and not 0
                if pd.notnull(score_a) and pd.notnull(score_b) and score_a != 0 and score_b != 0:
                    # Add the data to the restructured DataFrame
                    restructured_data = pd.concat([restructured_data, pd.DataFrame({'TeamA': [team_a], 'TeamB': [team_b], 'ScoreA': [score_a], 'ScoreB': [score_b]})], ignore_index=True)

# Save the restructured data to a new XLSX file
output_path = "/Users/ryansung/Downloads/restructured_2122.xlsx"
restructured_data.to_excel(output_path, index=False)

print("Data restructured and saved to restructured_data2122.xlsx successfully.")

Column names:
Index(['Unnamed: 0', '경기번호', 'TeamA', 'ASet1', 'ASet2', 'ASet3', 'ASet4',
       'ASet5', 'Atotal', 'TeamB', 'BSet1', 'BSet2', 'BSet3', 'BSet4', 'BSet5',
       'B팀총점', '경기시간', 'Set1', 'Set2', 'Set3', 'Set4', 'Set5'],
      dtype='object')
Data restructured and saved to restructured_data2122.xlsx successfully.


## Confirming 2122 data

In [88]:
# Read the CSV file into a pandas DataFrame
df = pd.read_csv('/Users/ryansung/Downloads/restructured_2122.csv')
df.head()

# Add a new column 'Win A' with the desired values
df['Win A'] = 1 * (df['ScoreA'] > df['ScoreB'])

# Add a new column 'Win B' with the desired values
df['Win B'] = 1 * (df['ScoreA'] < df['ScoreB'])

# Fill NaN values in 'Win A' and 'Win B' columns with 0
df[['Win A', 'Win B']] = df[['Win A', 'Win B']].fillna(0)

# Print the updated DataFrame
df.head()

Unnamed: 0,TeamA,TeamB,ScoreA,ScoreB,Win A,Win B
0,대한항공,우리카드,25,18,1,0
1,대한항공,우리카드,27,25,1,0
2,대한항공,우리카드,19,25,0,1
3,대한항공,우리카드,25,22,1,0
4,GS칼텍스,흥국생명,25,21,1,0


In [125]:
# Filter the DataFrame for rows where Team A is "대한항공" and sum the Win A values
total_wins_a = df.loc[df['TeamA'] == '대한항공', 'Win A'].sum()
total_wins_b = df.loc[df['TeamB'] == '대한항공', 'Win B'].sum()
# Print the total Win A values for "대한항공"
print("Total Win values for 대한항공:", total_wins_a + total_wins_b)

# Filter the DataFrame for rows where Team A is "현대캐피탈" and sum the Win A values
total_wins_a = df.loc[df['TeamA'] == '현대캐피탈', 'Win A'].sum()
total_wins_b = df.loc[df['TeamB'] == '현대캐피탈', 'Win B'].sum()
# Print the total Win A values for "현대캐피탈"
print("Total Win values for 현대캐피탈:", total_wins_a + total_wins_b)

# Filter the DataFrame for rows where Team A is "우리카드" and sum the Win A values
total_wins_a = df.loc[df['TeamA'] == '우리카드', 'Win A'].sum()
total_wins_b = df.loc[df['TeamB'] == '우리카드', 'Win B'].sum()
# Print the total Win A values for "우리카드"
print("Total Win values for 우리카드:", total_wins_a + total_wins_b)

# Filter the DataFrame for rows where Team A is "한국전력" and sum the Win A values
total_wins_a = df.loc[df['TeamA'] == '한국전력', 'Win A'].sum()
total_wins_b = df.loc[df['TeamB'] == '한국전력', 'Win B'].sum()
# Print the total Win A values for "한국전력"
print("Total Win values for 한국전력:", total_wins_a + total_wins_b)

# Filter the DataFrame for rows where Team A is "OK금융그룹" and sum the Win A values
total_wins_a = df.loc[df['TeamA'] == 'OK금융그룹', 'Win A'].sum()
total_wins_b = df.loc[df['TeamB'] == 'OK금융그룹', 'Win B'].sum()
# Print the total Win A values for "OK금융그룹"
print("Total Win values for OK금융그룹:", total_wins_a + total_wins_b)

# Filter the DataFrame for rows where Team A is "KB손해보험" and sum the Win A values
total_wins_a = df.loc[df['TeamA'] == 'KB손해보험', 'Win A'].sum()
total_wins_b = df.loc[df['TeamB'] == 'KB손해보험', 'Win B'].sum()
# Print the total Win A values for "KB손해보험"
print("Total Win values for KB손해보험:", total_wins_a + total_wins_b)

# Filter the DataFrame for rows where Team A is "삼성화재" and sum the Win A values
total_wins_a = df.loc[df['TeamA'] == '삼성화재', 'Win A'].sum()
total_wins_b = df.loc[df['TeamB'] == '삼성화재', 'Win B'].sum()
# Print the total Win A values for "삼성화재"
print("Total Win values for 삼성화재:", total_wins_a + total_wins_b)

Total Win values for 대한항공: 92
Total Win values for 현대캐피탈: 63
Total Win values for 우리카드: 84
Total Win values for 한국전력: 75
Total Win values for OK금융그룹: 75
Total Win values for KB손해보험: 73
Total Win values for 삼성화재: 46


## Converting 2021 excel to erase Korean

In [101]:
# Read the original CSV file
data = pd.read_csv("/Users/ryansung/Downloads/vleague_2021.csv")

# Specify the output path for the XLSX file
output_path = "/Users/ryansung/Downloads/vleague_2021.xlsx"

# Convert and save the data to XLSX
data.to_excel(output_path, index=False)

### Reconstructing 2021 data

In [103]:
import pandas as pd

# Read the original CSV file
data = pd.read_csv("/Users/ryansung/Downloads/vleague_2021.csv")

# Print the column names in the original data
print("Column names:")
print(data.columns)

# Create a new DataFrame for restructured data
restructured_data = pd.DataFrame(columns=['TeamA', 'TeamB', 'ScoreA', 'ScoreB'])

# Adjust the column names in the code based on the printed output

# Iterate over each row in the original data
for index, row in data.iterrows():
    # Check if the desired column names exist in the row
    if 'TeamA' in row and 'TeamB' in row:
        team_a = row['TeamA']
        team_b = row['TeamB']
        
        # Iterate over set columns (ASet1, ASet2, ..., ASet5)
        for i in range(1, 6):
            set_column_a = f"ASet{i}"
            set_column_b = f"BSet{i}"
            
            # Check if the set columns exist in the row
            if set_column_a in row and set_column_b in row:
                score_a = row[set_column_a]
                score_b = row[set_column_b]
                
                # Check if both set scores are not NaN and not 0
                if pd.notnull(score_a) and pd.notnull(score_b) and score_a != 0 and score_b != 0:
                    # Add the data to the restructured DataFrame
                    restructured_data = pd.concat([restructured_data, pd.DataFrame({'TeamA': [team_a], 'TeamB': [team_b], 'ScoreA': [score_a], 'ScoreB': [score_b]})], ignore_index=True)

# Save the restructured data to a new XLSX file
output_path = "/Users/ryansung/Downloads/restructured_2021.xlsx"
restructured_data.to_excel(output_path, index=False)

print("Data restructured and saved to restructured_data2021.xlsx successfully.")

Column names:
Index(['Unnamed: 0', '경기번호', 'TeamA', 'ASet1', 'ASet2', 'ASet3', 'ASet4',
       'ASet5', 'A팀총점', 'TeamB', 'BSet1', 'BSet2', 'BSet3', 'BSet4', 'BSet5',
       'B팀총점', '경기시간', 'Set1', 'Set2', 'Set3', 'Set4', 'Set5'],
      dtype='object')
Data restructured and saved to restructured_data2021.xlsx successfully.


## Confirming 2021 data

In [124]:
# Read the CSV file into a pandas DataFrame
df = pd.read_csv('/Users/ryansung/Downloads/restructured_2021.csv')
df.head()

# Add a new column 'Win A' with the desired values
df['Win A'] = 1 * (df['ScoreA'] > df['ScoreB'])

# Add a new column 'Win B' with the desired values
df['Win B'] = 1 * (df['ScoreA'] < df['ScoreB'])

# Fill NaN values in 'Win A' and 'Win B' columns with 0
df[['Win A', 'Win B']] = df[['Win A', 'Win B']].fillna(0)

# Print the updated DataFrame
df.head()

Unnamed: 0,TeamA,TeamB,ScoreA,ScoreB,Win A,Win B
0,우리카드,대한항공,20,25,0,1
1,우리카드,대한항공,21,25,0,1
2,우리카드,대한항공,25,23,1,0
3,우리카드,대한항공,25,23,1,0
4,우리카드,대한항공,7,15,0,1


In [107]:
# Filter the DataFrame for rows where Team A is "대한항공" and sum the Win A values
total_wins_a = df.loc[df['TeamA'] == '대한항공', 'Win A'].sum()
total_wins_b = df.loc[df['TeamB'] == '대한항공', 'Win B'].sum()
# Print the total Win A values for "대한항공"
print("Total Win values for 대한항공:", total_wins_a + total_wins_b)

# Filter the DataFrame for rows where Team A is "현대캐피탈" and sum the Win A values
total_wins_a = df.loc[df['TeamA'] == '현대캐피탈', 'Win A'].sum()
total_wins_b = df.loc[df['TeamB'] == '현대캐피탈', 'Win B'].sum()
# Print the total Win A values for "현대캐피탈"
print("Total Win values for 현대캐피탈:", total_wins_a + total_wins_b)

# Filter the DataFrame for rows where Team A is "우리카드" and sum the Win A values
total_wins_a = df.loc[df['TeamA'] == '우리카드', 'Win A'].sum()
total_wins_b = df.loc[df['TeamB'] == '우리카드', 'Win B'].sum()
# Print the total Win A values for "우리카드"
print("Total Win values for 우리카드:", total_wins_a + total_wins_b)

# Filter the DataFrame for rows where Team A is "한국전력" and sum the Win A values
total_wins_a = df.loc[df['TeamA'] == '한국전력', 'Win A'].sum()
total_wins_b = df.loc[df['TeamB'] == '한국전력', 'Win B'].sum()
# Print the total Win A values for "한국전력"
print("Total Win values for 한국전력:", total_wins_a + total_wins_b)

# Filter the DataFrame for rows where Team A is "OK금융그룹" and sum the Win A values
total_wins_a = df.loc[df['TeamA'] == 'OK금융그룹', 'Win A'].sum()
total_wins_b = df.loc[df['TeamB'] == 'OK금융그룹', 'Win B'].sum()
# Print the total Win A values for "OK금융그룹"
print("Total Win values for OK금융그룹:", total_wins_a + total_wins_b)

# Filter the DataFrame for rows where Team A is "KB손해보험" and sum the Win A values
total_wins_a = df.loc[df['TeamA'] == 'KB손해보험', 'Win A'].sum()
total_wins_b = df.loc[df['TeamB'] == 'KB손해보험', 'Win B'].sum()
# Print the total Win A values for "KB손해보험"
print("Total Win values for KB손해보험:", total_wins_a + total_wins_b)

# Filter the DataFrame for rows where Team A is "삼성화재" and sum the Win A values
total_wins_a = df.loc[df['TeamA'] == '삼성화재', 'Win A'].sum()
total_wins_b = df.loc[df['TeamB'] == '삼성화재', 'Win B'].sum()
# Print the total Win A values for "삼성화재"
print("Total Win values for 삼성화재:", total_wins_a + total_wins_b)

Total Win values for 대한항공: 92
Total Win values for 현대캐피탈: 63
Total Win values for 우리카드: 84
Total Win values for 한국전력: 75
Total Win values for OK금융그룹: 75
Total Win values for KB손해보험: 73
Total Win values for 삼성화재: 46


## Converting 1920 excel to erase Korean

In [110]:
# Read the original CSV file
data = pd.read_csv("/Users/ryansung/Downloads/vleague_1920.csv", encoding='cp949')

# Specify the output path for the XLSX file
output_path = "/Users/ryansung/Downloads/vleague_1920.xlsx"

# Convert and save the data to XLSX
data.to_excel(output_path, index=False)

### Reconstructing 2021 data

In [111]:
import pandas as pd

# Read the original CSV file
data = pd.read_csv("/Users/ryansung/Downloads/vleague_1920.csv")

# Print the column names in the original data
print("Column names:")
print(data.columns)

# Create a new DataFrame for restructured data
restructured_data = pd.DataFrame(columns=['TeamA', 'TeamB', 'ScoreA', 'ScoreB'])

# Adjust the column names in the code based on the printed output

# Iterate over each row in the original data
for index, row in data.iterrows():
    # Check if the desired column names exist in the row
    if 'TeamA' in row and 'TeamB' in row:
        team_a = row['TeamA']
        team_b = row['TeamB']
        
        # Iterate over set columns (ASet1, ASet2, ..., ASet5)
        for i in range(1, 6):
            set_column_a = f"ASet{i}"
            set_column_b = f"BSet{i}"
            
            # Check if the set columns exist in the row
            if set_column_a in row and set_column_b in row:
                score_a = row[set_column_a]
                score_b = row[set_column_b]
                
                # Check if both set scores are not NaN and not 0
                if pd.notnull(score_a) and pd.notnull(score_b) and score_a != 0 and score_b != 0:
                    # Add the data to the restructured DataFrame
                    restructured_data = pd.concat([restructured_data, pd.DataFrame({'TeamA': [team_a], 'TeamB': [team_b], 'ScoreA': [score_a], 'ScoreB': [score_b]})], ignore_index=True)

# Save the restructured data to a new XLSX file
output_path = "/Users/ryansung/Downloads/restructured_1920.xlsx"
restructured_data.to_excel(output_path, index=False)

print("Data restructured and saved to restructured_data1920.xlsx successfully.")

Column names:
Index(['Unnamed: 0', '경기번호', 'TeamA', 'ASet1', 'ASet2', 'ASet3', 'ASet4',
       'ASet5', 'Atotal', 'TeamB', 'BSet1', 'BSet2', 'BSet3', 'BSet4', 'BSet5',
       'Btotal', '경기시간', 'Set1', 'Set2', 'Set3', 'Set4', 'Set5'],
      dtype='object')
Data restructured and saved to restructured_data1920.xlsx successfully.


## Confirming 1920 data

In [113]:
# Read the CSV file into a pandas DataFrame
df = pd.read_csv('/Users/ryansung/Downloads/restructured_1920.csv')
df.head()

# Add a new column 'Win A' with the desired values
df['Win A'] = 1 * (df['ScoreA'] > df['ScoreB'])

# Add a new column 'Win B' with the desired values
df['Win B'] = 1 * (df['ScoreA'] < df['ScoreB'])

# Fill NaN values in 'Win A' and 'Win B' columns with 0
df[['Win A', 'Win B']] = df[['Win A', 'Win B']].fillna(0)

# Print the updated DataFrame
df.head()

Unnamed: 0,TeamA,TeamB,ScoreA,ScoreB,Win A,Win B
0,현대캐피탈,대한항공,23,25,0,1
1,현대캐피탈,대한항공,23,25,0,1
2,현대캐피탈,대한항공,25,20,1,0
3,현대캐피탈,대한항공,22,25,0,1
4,삼성화재,우리카드,14,25,0,1


In [121]:
# Filter the DataFrame for rows where Team A is "대한항공" and sum the Win A values
total_wins_a = df.loc[df['TeamA'] == '대한항공', 'Win A'].sum()
total_wins_b = df.loc[df['TeamB'] == '대한항공', 'Win B'].sum()
# Print the total Win A values for "대한항공"
print("Total Win values for 대한항공:", total_wins_a + total_wins_b)

# Filter the DataFrame for rows where Team A is "현대캐피탈" and sum the Win A values
total_wins_a = df.loc[df['TeamA'] == '현대캐피탈', 'Win A'].sum()
total_wins_b = df.loc[df['TeamB'] == '현대캐피탈', 'Win B'].sum()
# Print the total Win A values for "현대캐피탈"
print("Total Win values for 현대캐피탈:", total_wins_a + total_wins_b)

# Filter the DataFrame for rows where Team A is "우리카드" and sum the Win A values
total_wins_a = df.loc[df['TeamA'] == '우리카드', 'Win A'].sum()
total_wins_b = df.loc[df['TeamB'] == '우리카드', 'Win B'].sum()
# Print the total Win A values for "우리카드"
print("Total Win values for 우리카드:", total_wins_a + total_wins_b)

# Filter the DataFrame for rows where Team A is "한국전력" and sum the Win A values
total_wins_a = df.loc[df['TeamA'] == '한국전력', 'Win A'].sum()
total_wins_b = df.loc[df['TeamB'] == '한국전력', 'Win B'].sum()
# Print the total Win A values for "한국전력"
print("Total Win values for 한국전력:", total_wins_a + total_wins_b)

# Filter the DataFrame for rows where Team A is "OK금융그룹" and sum the Win A values
total_wins_a = df.loc[df['TeamA'] == 'OK저축은행', 'Win A'].sum()
total_wins_b = df.loc[df['TeamB'] == 'OK저축은행', 'Win B'].sum()
# Print the total Win A values for "OK금융그룹"
print("Total Win values for OK저축은행:", total_wins_a + total_wins_b)

# Filter the DataFrame for rows where Team A is "KB손해보험" and sum the Win A values
total_wins_a = df.loc[df['TeamA'] == 'KB손해보험', 'Win A'].sum()
total_wins_b = df.loc[df['TeamB'] == 'KB손해보험', 'Win B'].sum()
# Print the total Win A values for "KB손해보험"
print("Total Win values for KB손해보험:", total_wins_a + total_wins_b)

# Filter the DataFrame for rows where Team A is "삼성화재" and sum the Win A values
total_wins_a = df.loc[df['TeamA'] == '삼성화재', 'Win A'].sum()
total_wins_b = df.loc[df['TeamB'] == '삼성화재', 'Win B'].sum()
# Print the total Win A values for "삼성화재"
print("Total Win values for 삼성화재:", total_wins_a + total_wins_b)

Total Win values for 대한항공: 92
Total Win values for 현대캐피탈: 85
Total Win values for 우리카드: 77
Total Win values for 한국전력: 40
Total Win values for OK저축은행: 65
Total Win values for KB손해보험: 64
Total Win values for 삼성화재: 73


## Converting 1819 excel to erase Korean

In [116]:
# Read the original CSV file
data = pd.read_csv("/Users/ryansung/Downloads/vleague_1819.csv", encoding='cp949')

# Specify the output path for the XLSX file
output_path = "/Users/ryansung/Downloads/vleague_1819.xlsx"

# Convert and save the data to XLSX
data.to_excel(output_path, index=False)

### Reconstructing 1819 data

In [117]:
import pandas as pd

# Read the original CSV file
data = pd.read_csv("/Users/ryansung/Downloads/vleague_1819.csv")

# Print the column names in the original data
print("Column names:")
print(data.columns)

# Create a new DataFrame for restructured data
restructured_data = pd.DataFrame(columns=['TeamA', 'TeamB', 'ScoreA', 'ScoreB'])

# Adjust the column names in the code based on the printed output

# Iterate over each row in the original data
for index, row in data.iterrows():
    # Check if the desired column names exist in the row
    if 'TeamA' in row and 'TeamB' in row:
        team_a = row['TeamA']
        team_b = row['TeamB']
        
        # Iterate over set columns (ASet1, ASet2, ..., ASet5)
        for i in range(1, 6):
            set_column_a = f"ASet{i}"
            set_column_b = f"BSet{i}"
            
            # Check if the set columns exist in the row
            if set_column_a in row and set_column_b in row:
                score_a = row[set_column_a]
                score_b = row[set_column_b]
                
                # Check if both set scores are not NaN and not 0
                if pd.notnull(score_a) and pd.notnull(score_b) and score_a != 0 and score_b != 0:
                    # Add the data to the restructured DataFrame
                    restructured_data = pd.concat([restructured_data, pd.DataFrame({'TeamA': [team_a], 'TeamB': [team_b], 'ScoreA': [score_a], 'ScoreB': [score_b]})], ignore_index=True)

# Save the restructured data to a new XLSX file
output_path = "/Users/ryansung/Downloads/restructured_1819.xlsx"
restructured_data.to_excel(output_path, index=False)

print("Data restructured and saved to restructured_data1819.xlsx successfully.")

Column names:
Index(['Unnamed: 0', '경기번호', 'TeamA', 'ASet1', 'ASet2', 'ASet3', 'ASet4',
       'ASet5', 'Atotal', 'TeamB', 'BSet1', 'BSet2', 'BSet3', 'BSet4', 'BSet5',
       'Btotal', '경기시간', 'Set1', 'Set2', 'Set3', 'Set4', 'Set5'],
      dtype='object')
Data restructured and saved to restructured_data1819.xlsx successfully.


## Confirming 1819 data

In [118]:
# Read the CSV file into a pandas DataFrame
df = pd.read_csv('/Users/ryansung/Downloads/restructured_1819.csv')
df.head()

# Add a new column 'Win A' with the desired values
df['Win A'] = 1 * (df['ScoreA'] > df['ScoreB'])

# Add a new column 'Win B' with the desired values
df['Win B'] = 1 * (df['ScoreA'] < df['ScoreB'])

# Fill NaN values in 'Win A' and 'Win B' columns with 0
df[['Win A', 'Win B']] = df[['Win A', 'Win B']].fillna(0)

# Print the updated DataFrame
df.head()

Unnamed: 0,TeamA,TeamB,ScoreA,ScoreB,Win A,Win B
0,대한항공,현대캐피탈,21,25,0,1
1,대한항공,현대캐피탈,23,25,0,1
2,대한항공,현대캐피탈,20,25,0,1
3,삼성화재,우리카드,20,25,0,1
4,삼성화재,우리카드,25,19,1,0


In [122]:
# Filter the DataFrame for rows where Team A is "대한항공" and sum the Win A values
total_wins_a = df.loc[df['TeamA'] == '대한항공', 'Win A'].sum()
total_wins_b = df.loc[df['TeamB'] == '대한항공', 'Win B'].sum()
# Print the total Win A values for "대한항공"
print("Total Win values for 대한항공:", total_wins_a + total_wins_b)

# Filter the DataFrame for rows where Team A is "현대캐피탈" and sum the Win A values
total_wins_a = df.loc[df['TeamA'] == '현대캐피탈', 'Win A'].sum()
total_wins_b = df.loc[df['TeamB'] == '현대캐피탈', 'Win B'].sum()
# Print the total Win A values for "현대캐피탈"
print("Total Win values for 현대캐피탈:", total_wins_a + total_wins_b)

# Filter the DataFrame for rows where Team A is "우리카드" and sum the Win A values
total_wins_a = df.loc[df['TeamA'] == '우리카드', 'Win A'].sum()
total_wins_b = df.loc[df['TeamB'] == '우리카드', 'Win B'].sum()
# Print the total Win A values for "우리카드"
print("Total Win values for 우리카드:", total_wins_a + total_wins_b)

# Filter the DataFrame for rows where Team A is "한국전력" and sum the Win A values
total_wins_a = df.loc[df['TeamA'] == '한국전력', 'Win A'].sum()
total_wins_b = df.loc[df['TeamB'] == '한국전력', 'Win B'].sum()
# Print the total Win A values for "한국전력"
print("Total Win values for 한국전력:", total_wins_a + total_wins_b)

# Filter the DataFrame for rows where Team A is "OK저축은행" and sum the Win A values
total_wins_a = df.loc[df['TeamA'] == 'OK저축은행', 'Win A'].sum()
total_wins_b = df.loc[df['TeamB'] == 'OK저축은행', 'Win B'].sum()
# Print the total Win A values for "OK저축은행"
print("Total Win values for OK저축은행:", total_wins_a + total_wins_b)

# Filter the DataFrame for rows where Team A is "KB손해보험" and sum the Win A values
total_wins_a = df.loc[df['TeamA'] == 'KB손해보험', 'Win A'].sum()
total_wins_b = df.loc[df['TeamB'] == 'KB손해보험', 'Win B'].sum()
# Print the total Win A values for "KB손해보험"
print("Total Win values for KB손해보험:", total_wins_a + total_wins_b)

# Filter the DataFrame for rows where Team A is "삼성화재" and sum the Win A values
total_wins_a = df.loc[df['TeamA'] == '삼성화재', 'Win A'].sum()
total_wins_b = df.loc[df['TeamB'] == '삼성화재', 'Win B'].sum()
# Print the total Win A values for "삼성화재"
print("Total Win values for 삼성화재:", total_wins_a + total_wins_b)

Total Win values for 대한항공: 92
Total Win values for 현대캐피탈: 85
Total Win values for 우리카드: 77
Total Win values for 한국전력: 40
Total Win values for OK저축은행: 65
Total Win values for KB손해보험: 64
Total Win values for 삼성화재: 73


## acc_check

In [172]:
import pandas as pd

# Assuming your dataset is stored in a pandas DataFrame called 'df'
# Replace 'your_dataset.csv' with the actual file name or use your own data loading method

# Load the dataset
df = pd.read_csv('/Users/ryansung/Downloads/acc_check.csv')

# Create an empty list to store unique team pairs
encountered_pairs = []

# Create an empty list to store extracted rows
extracted_rows = []

# Iterate through each row in the dataset
for index, row in df.iterrows():
    team_pair = tuple(sorted([row['TeamA'], row['TeamB']]))

    # Check if the pair has already been encountered
    if team_pair not in encountered_pairs:
        # Add the pair to the list of encountered pairs
        encountered_pairs.append(team_pair)
        
        # Add the row to the list of extracted rows
        extracted_rows.append(row)
    else:
        # Check if the reversed pair is also in the encountered pairs list
        reversed_pair = tuple(reversed(team_pair))
        
        if reversed_pair not in encountered_pairs:
            # Add the reversed pair to the list of encountered pairs
            encountered_pairs.append(reversed_pair)
            
            # Add the row to the list of extracted rows
            extracted_rows.append(row)

# Create a new data frame with the extracted rows
new_df = pd.DataFrame(extracted_rows)

# Extract only the odd-numbered rows
odd_numbered_rows = new_df.iloc[::2]



# Print the odd-numbered rows
odd_numbered_rows.head(80)


Unnamed: 0,TeamA,TeamB,ScoreA,ScoreB,Result,EloA,EloB,WinProb,above50
0,현대캐피탈,대한항공,23,25,0,1531.212,1583.655,0.425096,0
4,삼성화재,우리카드,14,25,0,1549.197,1480.57,0.597497,1
7,KB손해보험,한국전력,22,25,0,1519.086,1352.326,0.723111,1
12,삼성화재,OK저축은행,26,24,1,1549.197,1483.953,0.592805,1
16,현대캐피탈,우리카드,25,13,1,1531.212,1480.57,0.572368,1
21,대한항공,한국전력,25,14,1,1583.655,1352.326,0.791111,1
24,KB손해보험,삼성화재,21,25,0,1519.086,1549.197,0.456775,0
29,흥국생명,한국도로공사,25,17,1,1603.992,1592.073,0.517146,1
33,OK저축은행,우리카드,25,23,1,1483.953,1480.57,0.504868,1
37,IBK기업은행,KGC인삼공사,25,20,1,1488.056,1339.729,0.701372,1


In [173]:
# Get the number of rows
num_rows = len(odd_numbered_rows)

# Print the number of rows
print("Number of rows:", num_rows)

Number of rows: 36


In [174]:
# Extract rows where both "Result" and "above50" are equal to 1
filtered_rows = odd_numbered_rows[(odd_numbered_rows['Result'] == 1) & (odd_numbered_rows['above50'] == 1)]
filtered_rows2 = odd_numbered_rows[(odd_numbered_rows['Result'] == 0) & (odd_numbered_rows['above50'] == 0)]
print(len(filtered_rows)+len(filtered_rows2))

25


## acc_check2

In [175]:
# Load the dataset
df = pd.read_csv('/Users/ryansung/Downloads/acc_check2.csv')

# Create an empty list to store unique team pairs
encountered_pairs = []

# Create an empty list to store extracted rows
extracted_rows = []

# Iterate through each row in the dataset
for index, row in df.iterrows():
    team_pair = tuple(sorted([row['TeamA'], row['TeamB']]))

    # Check if the pair has already been encountered
    if team_pair not in encountered_pairs:
        # Add the pair to the list of encountered pairs
        encountered_pairs.append(team_pair)
        
        # Add the row to the list of extracted rows
        extracted_rows.append(row)
    else:
        # Check if the reversed pair is also in the encountered pairs list
        reversed_pair = tuple(reversed(team_pair))
        
        if reversed_pair not in encountered_pairs:
            # Add the reversed pair to the list of encountered pairs
            encountered_pairs.append(reversed_pair)
            
            # Add the row to the list of extracted rows
            extracted_rows.append(row)

# Create a new data frame with the extracted rows
new_df = pd.DataFrame(extracted_rows)

# Extract only the odd-numbered rows
odd_numbered_rows = new_df.iloc[::2]



# Print the odd-numbered rows
odd_numbered_rows.head(80)

Unnamed: 0,TeamA,TeamB,ScoreA,ScoreB,Result,EloA,EloB,WinProb,above50
0,우리카드,대한항공,20,25,0,1646.432,1650.347,0.494366,0
5,현대건설,GS칼텍스,19,25,0,1580.082,1550.911,0.541882,1
10,한국전력,삼성화재,26,24,1,1338.953,1438.477,0.360567,0
15,KGC인삼공사,IBK기업은행,25,22,1,1530.192,1415.797,0.658926,1
19,현대캐피탈,우리카드,25,21,1,1523.181,1646.432,0.329712,0
22,대한항공,삼성화재,25,13,1,1650.347,1438.477,0.771997,1
26,GS칼텍스,흥국생명,27,29,0,1550.911,1559.579,0.487528,0
30,OK금융그룹,한국전력,25,19,1,1490.28,1338.953,0.704976,1
34,우리카드,KB손해보험,27,29,0,1646.432,1412.329,0.793738,1
38,현대건설,한국도로공사,25,23,1,1580.082,1363.439,0.776797,1


In [176]:
# Get the number of rows
num_rows = len(odd_numbered_rows)

# Print the number of rows
print("Number of rows:", num_rows)

Number of rows: 36


In [177]:
# Extract rows where both "Result" and "above50" are equal to 1
filtered_rows = odd_numbered_rows[(odd_numbered_rows['Result'] == 1) & (odd_numbered_rows['above50'] == 1)]
filtered_rows2 = odd_numbered_rows[(odd_numbered_rows['Result'] == 0) & (odd_numbered_rows['above50'] == 0)]
print(len(filtered_rows)+len(filtered_rows2))

19


## acc_check3

In [178]:
# Load the dataset
df = pd.read_csv('/Users/ryansung/Downloads/acc_check3.csv')

# Create an empty list to store unique team pairs
encountered_pairs = []

# Create an empty list to store extracted rows
extracted_rows = []

# Iterate through each row in the dataset
for index, row in df.iterrows():
    team_pair = tuple(sorted([row['TeamA'], row['TeamB']]))

    # Check if the pair has already been encountered
    if team_pair not in encountered_pairs:
        # Add the pair to the list of encountered pairs
        encountered_pairs.append(team_pair)
        
        # Add the row to the list of extracted rows
        extracted_rows.append(row)
    else:
        # Check if the reversed pair is also in the encountered pairs list
        reversed_pair = tuple(reversed(team_pair))
        
        if reversed_pair not in encountered_pairs:
            # Add the reversed pair to the list of encountered pairs
            encountered_pairs.append(reversed_pair)
            
            # Add the row to the list of extracted rows
            extracted_rows.append(row)

# Create a new data frame with the extracted rows
new_df = pd.DataFrame(extracted_rows)

# Extract only the odd-numbered rows
odd_numbered_rows = new_df.iloc[::2]



# Print the odd-numbered rows
odd_numbered_rows.head(80)

Unnamed: 0,TeamA,TeamB,ScoreA,ScoreB,Result,EloA,EloB,WinProb,above50
0,대한항공,우리카드,25,18,1,1627.238,1604.082,0.533275,1
4,GS칼텍스,흥국생명,25,21,1,1577.789,1486.829,0.627991,1
7,현대캐피탈,OK금융그룹,23,25,0,1484.798,1476.434,0.512034,1
11,현대건설,IBK기업은행,23,25,0,1484.465,1467.224,0.524791,1
15,삼성화재,한국전력,20,25,0,1364.935,1478.963,0.341549,0
18,페퍼저축은행,KGC인삼공사,25,16,1,1500.0,1502.496,0.496408,0
22,KB손해보험,현대캐피탈,25,22,1,1463.549,1484.798,0.469458,0
27,한국도로공사,현대건설,13,25,0,1481.197,1484.465,0.495297,0
30,OK금융그룹,우리카드,21,25,0,1476.434,1604.082,0.324142,0
35,IBK기업은행,흥국생명,25,22,1,1467.224,1486.829,0.471816,0


In [179]:
# Get the number of rows
num_rows = len(odd_numbered_rows)

# Print the number of rows
print("Number of rows:", num_rows)

Number of rows: 42


In [180]:
# Extract rows where both "Result" and "above50" are equal to 1
filtered_rows = odd_numbered_rows[(odd_numbered_rows['Result'] == 1) & (odd_numbered_rows['above50'] == 1)]
filtered_rows2 = odd_numbered_rows[(odd_numbered_rows['Result'] == 0) & (odd_numbered_rows['above50'] == 0)]
print(len(filtered_rows)+len(filtered_rows2))

22


## acc_check4

In [181]:
# Load the dataset
df = pd.read_csv('/Users/ryansung/Downloads/acc_check4.csv')

# Create an empty list to store unique team pairs
encountered_pairs = []

# Create an empty list to store extracted rows
extracted_rows = []

# Iterate through each row in the dataset
for index, row in df.iterrows():
    team_pair = tuple(sorted([row['TeamA'], row['TeamB']]))

    # Check if the pair has already been encountered
    if team_pair not in encountered_pairs:
        # Add the pair to the list of encountered pairs
        encountered_pairs.append(team_pair)
        
        # Add the row to the list of extracted rows
        extracted_rows.append(row)
    else:
        # Check if the reversed pair is also in the encountered pairs list
        reversed_pair = tuple(reversed(team_pair))
        
        if reversed_pair not in encountered_pairs:
            # Add the reversed pair to the list of encountered pairs
            encountered_pairs.append(reversed_pair)
            
            # Add the row to the list of extracted rows
            extracted_rows.append(row)

# Create a new data frame with the extracted rows
new_df = pd.DataFrame(extracted_rows)

# Extract only the odd-numbered rows
odd_numbered_rows = new_df.iloc[::2]



# Print the odd-numbered rows
odd_numbered_rows.head(80)

Unnamed: 0,TeamA,TeamB,ScoreA,ScoreB,Result,EloA,EloB,WinProb,above50
0,대한항공,KB손해보험,25,21,1,1594.046,1502.312,0.629031,1
4,현대건설,한국도로공사,25,13,1,1637.624,1628.489,0.513143,1
7,OK금융그룹,한국전력,18,25,0,1431.919,1545.19,0.34253,0
10,IBK기업은행,GS칼텍스,19,25,0,1491.162,1625.218,0.316114,0
13,삼성화재,현대캐피탈,23,25,0,1442.913,1427.729,0.521838,1
17,흥국생명,페퍼저축은행,25,16,1,1398.627,1292.186,0.648562,1
20,OK금융그룹,우리카드,18,25,0,1431.919,1555.889,0.328798,0
23,IBK기업은행,KGC인삼공사,20,25,0,1491.162,1426.694,0.591727,1
28,KB손해보험,한국전력,28,26,1,1502.312,1545.19,0.438605,0
32,한국도로공사,GS칼텍스,25,21,1,1628.489,1625.218,0.504707,1


In [182]:
# Get the number of rows
num_rows = len(odd_numbered_rows)

# Print the number of rows
print("Number of rows:", num_rows)

Number of rows: 42


In [183]:
# Extract rows where both "Result" and "above50" are equal to 1
filtered_rows = odd_numbered_rows[(odd_numbered_rows['Result'] == 1) & (odd_numbered_rows['above50'] == 1)]
filtered_rows2 = odd_numbered_rows[(odd_numbered_rows['Result'] == 0) & (odd_numbered_rows['above50'] == 0)]
print(len(filtered_rows)+len(filtered_rows2))

19


In [184]:
#Overall accuracy predicting first sets of the next year using prev elo = 
85/156

0.5448717948717948

In [188]:
import pandas as pd

# Load the data sets
df1 = pd.read_csv('/Users/ryansung/Downloads/vnl_elo_ratings.csv')
df2 = pd.read_csv('/Users/ryansung/Downloads/vnl_elo_ratings2.csv')
df3 = pd.read_csv('/Users/ryansung/Downloads/vnl_elo_ratings3.csv')
df4 = pd.read_csv('/Users/ryansung/Downloads/vnl_elo_ratings4.csv')
df5 = pd.read_csv('/Users/ryansung/Downloads/vnl_elo_ratings5.csv')

# Concatenate the data sets vertically
combined_df = pd.concat([df1, df2, df3, df4, df5])

print(len(combined_df))


4335


In [190]:
combined_df.head(4335)

Unnamed: 0.1,Unnamed: 0,team.A,team.B,p.A,wins.A,update.A,update.B,elo.A,elo.B
0,1,대한항공,현대캐피탈,0.50,0,-11.00,11.00,1489.00,1511.00
1,2,대한항공,현대캐피탈,0.47,0,-10.30,10.30,1478.70,1521.30
2,3,대한항공,현대캐피탈,0.44,0,-9.66,9.66,1469.04,1530.96
3,4,삼성화재,우리카드,0.50,0,-11.00,11.00,1489.00,1511.00
4,5,삼성화재,우리카드,0.47,1,11.70,-11.70,1500.70,1499.30
...,...,...,...,...,...,...,...,...,...
975,976,삼성화재,대한항공,0.35,1,11.00,-11.00,1479.06,1562.26
976,977,흥국생명,현대건설,0.66,1,5.84,-5.84,1616.58,1492.53
977,978,흥국생명,현대건설,0.67,0,-11.41,11.41,1605.17,1503.95
978,979,흥국생명,현대건설,0.64,1,6.09,-6.09,1611.26,1497.85


In [191]:
# Filter the dataset based on conditions
filtered_df = combined_df[(combined_df['p.A'] > 0.5) & (combined_df['wins.A'] == 1)]
filtered_df2 = combined_df[(combined_df['p.A'] < 0.5) & (combined_df['wins.A'] == 0)]
print(len(filtered_df)+len(filtered_df2))

2477


In [192]:
# Count the number of correct predictions
correct_predictions = len(filtered_df)+len(filtered_df2)

# Calculate the total number of instances
total_instances = len(combined_df)

# Calculate the accuracy
accuracy = correct_predictions / total_instances

# Print the result
print("Accuracy:", accuracy)

Accuracy: 0.5713956170703576
