In [1]:
import pandas as pd
import statsmodels.api as sm

# Number of CSV rows handed back per chunk by the chunked reader
chunk_size = 1_000

# Function to calculate the nth triangular number
def triangular_number(n):
    """Return the n-th triangular number via the closed form n * (n + 1) // 2.

    For a non-negative integer n this equals 1 + 2 + ... + n. Floor division
    is used, so an integer argument produces an integer result.
    """
    doubled = n * (n + 1)
    return doubled // 2

# Create an iterator to read in chunks
chunk_iter = pd.read_csv('replay_data_public.WOE.PremierDraft.csv', chunksize=chunk_size)

# Only the first `max_turn_pairs` numbered turn columns of each game are tallied.
# The same value is used as the stand-in for a missing/unreadable 'num_turns'.
max_turn_pairs = 9

# Process only the first chunk
for chunk in chunk_iter:
    # Coerce 'num_turns' to whole numbers. Non-numeric entries become NaN and are then
    # replaced by max_turn_pairs. The explicit int cast matters: a single NaN makes the
    # whole column float64, and float turn counts cannot be used as range() bounds.
    chunk['num_turns'] = (
        pd.to_numeric(chunk['num_turns'], errors='coerce')
        .fillna(max_turn_pairs)
        .astype(int)
    )

    # Coerce every per-turn mana column to numeric, treating unreadable entries as 0 mana
    for i in range(1, max_turn_pairs + 1):
        for turn_owner in ('user', 'oppo'):
            for spender in ('user', 'oppo'):
                column = f'{turn_owner}_turn_{i}_{spender}_mana_spent'
                chunk[column] = pd.to_numeric(chunk[column], errors='coerce').fillna(0)

    # Number of turn pairs that count towards each game's totals
    turns_counted = chunk['num_turns'].clip(upper=max_turn_pairs)

    # Accumulate mana per game column-wise instead of row by row. For turn i, only games
    # with at least i counted turns contribute; every other game adds 0 for that turn.
    user_mana_total = pd.Series(0, index=chunk.index)
    oppo_mana_total = pd.Series(0, index=chunk.index)
    for i in range(1, max_turn_pairs + 1):
        turn_played = turns_counted >= i
        user_spent = chunk[f'user_turn_{i}_user_mana_spent'] + chunk[f'oppo_turn_{i}_user_mana_spent']
        oppo_spent = chunk[f'user_turn_{i}_oppo_mana_spent'] + chunk[f'oppo_turn_{i}_oppo_mana_spent']
        user_mana_total = user_mana_total + user_spent.where(turn_played, 0)
        oppo_mana_total = oppo_mana_total + oppo_spent.where(turn_played, 0)

    # Determine if the player who spent more mana won the game (ties count as False)
    user_won = chunk['won'].astype(bool)
    more_mana_won = ((user_mana_total > oppo_mana_total) & user_won) | \
                    ((oppo_mana_total > user_mana_total) & ~user_won)

    # One row per game; the triangular number is taken from the full (uncapped) num_turns
    mana_spent_per_game_df = pd.DataFrame({
        'game': chunk.index,
        'user_mana_total': user_mana_total,
        'oppo_mana_total': oppo_mana_total,
        'num_turns': chunk['num_turns'],
        'nth_triangular_number': triangular_number(chunk['num_turns']),
        'won': chunk['won'],
        'more_mana_won': more_mana_won,
    }).reset_index(drop=True)
    mana_spent_per_game_df['mana_diff'] = (
        mana_spent_per_game_df['user_mana_total'] - mana_spent_per_game_df['oppo_mana_total']
    )

    # Convert 'won' to an integer (1 for True, 0 for False)
    mana_spent_per_game_df['won_int'] = mana_spent_per_game_df['won'].astype(int)

    # Perform logistic regression of the game outcome on the mana difference
    X = mana_spent_per_game_df[['mana_diff']]  # Predictor variable
    y = mana_spent_per_game_df['won_int']      # Outcome variable
    X = sm.add_constant(X)
    model = sm.Logit(y, X).fit()

    # Filter the DataFrame for games where num_turns < max_turn_pairs
    # NOTE(review): filtered_df is not used by the ratio below, which is computed over all
    # games in the chunk - confirm whether the ratio was meant to be restricted to it.
    filtered_df = mana_spent_per_game_df[mana_spent_per_game_df['num_turns'] < max_turn_pairs]

    # Calculate the share of games in which 'more_mana_won' is True
    true_count = mana_spent_per_game_df['more_mana_won'].sum()
    total_count = len(mana_spent_per_game_df)
    ratio = true_count / total_count

    # Display the mana spent and additional info for each game in the first chunk
    print("Mana Spent and Additional Info for Each Game in the First Chunk:")
    print(mana_spent_per_game_df.head(40))

    # Print the ratio; the label names the column that is actually summed
    print("Ratio of 'more_mana_won' being True out of total rows:")
    print(ratio)
    print(model.summary())
    # Stop after processing the first chunk
    break

  for chunk in chunk_iter:


Optimization terminated successfully.
         Current function value: 0.563634
         Iterations 6
Mana Spent and Additional Info for Each Game in the First Chunk:
    game  user_mana_total  oppo_mana_total  num_turns  nth_triangular_number  \
0      0             21.0             20.0          7                     28   
1      1             13.0              9.0          6                     21   
2      2             17.0             16.0          8                     36   
3      3             18.0             15.0          6                     21   
4      4             22.0             26.0          7                     28   
5      5             25.0             38.0         10                     55   
6      6             18.0             25.0          9                     45   
7      7             24.0             38.0         10                     55   
8      8             31.0             34.0         12                     78   
9      9             21.0        