In [1]:
import pandas as pd
import numpy as np
import os
from pathlib import Path

In [2]:
# Resolve the data directory relative to the kernel's working directory and
# switch into it so later cells can use bare file names.
current_dir = Path().resolve()

# Re-running this cell after the chdir below would otherwise look for a
# nested data/data folder and fail; if we are already inside `data` (and no
# nested `data` folder exists), stay where we are.
if current_dir.name == 'data' and not (current_dir / 'data').is_dir():
    data_dir = current_dir
else:
    data_dir = current_dir / 'data'
os.chdir(data_dir)


In [3]:
# os.listdir() returns entries in arbitrary, platform-dependent order;
# sort them so the displayed listing is stable across machines and re-runs.
sorted(os.listdir())

['Cities.csv',
 'Conferences.csv',
 'MConferenceTourneyGames.csv',
 'MGameCities.csv',
 'MMasseyOrdinals.csv',
 'MNCAATourneyCompactResults.csv',
 'MNCAATourneyDetailedResults.csv',
 'MNCAATourneySeedRoundSlots.csv',
 'MNCAATourneySeeds.csv',
 'MNCAATourneySlots.csv',
 'MRegularSeasonCompactResults.csv',
 'MRegularSeasonDetailedResults.csv',
 'MSeasons.csv',
 'MSecondaryTourneyCompactResults.csv',
 'MSecondaryTourneyTeams.csv',
 'MTeamCoaches.csv',
 'MTeamConferences.csv',
 'MTeams.csv',
 'MTeamSpellings.csv',
 'SampleSubmissionStage1.csv',
 'SeedBenchmarkStage1.csv',
 'WConferenceTourneyGames.csv',
 'WGameCities.csv',
 'WNCAATourneyCompactResults.csv',
 'WNCAATourneyDetailedResults.csv',
 'WNCAATourneySeeds.csv',
 'WNCAATourneySlots.csv',
 'WRegularSeasonCompactResults.csv',
 'WRegularSeasonDetailedResults.csv',
 'WSeasons.csv',
 'WSecondaryTourneyCompactResults.csv',
 'WSecondaryTourneyTeams.csv',
 'WTeamConferences.csv',
 'WTeams.csv',
 'WTeamSpellings.csv']

# EDA

In [4]:
def _read_csv_with_fallback(csv_path):
    """Read a CSV as UTF-8, retrying with Latin-1 if decoding fails."""
    try:
        return pd.read_csv(csv_path, encoding='utf-8')
    except UnicodeDecodeError:
        # Some files are not valid UTF-8; Latin-1 accepts any byte sequence.
        return pd.read_csv(csv_path, encoding='latin1')


# Load every CSV in the data directory into a dict keyed by the file name
# without its extension.
dataframes = {}
files = os.listdir(data_dir)

for file in files:
    if not file.endswith('.csv'):
        continue

    df_name, _ext = os.path.splitext(file)
    file_path = os.path.join(data_dir, file)
    dataframes[df_name] = _read_csv_with_fallback(file_path)

    print(f'Loaded {file} into DataFrame {df_name}')


Loaded Cities.csv into DataFrame Cities
Loaded Conferences.csv into DataFrame Conferences
Loaded MConferenceTourneyGames.csv into DataFrame MConferenceTourneyGames
Loaded MGameCities.csv into DataFrame MGameCities
Loaded MMasseyOrdinals.csv into DataFrame MMasseyOrdinals
Loaded MNCAATourneyCompactResults.csv into DataFrame MNCAATourneyCompactResults
Loaded MNCAATourneyDetailedResults.csv into DataFrame MNCAATourneyDetailedResults
Loaded MNCAATourneySeedRoundSlots.csv into DataFrame MNCAATourneySeedRoundSlots
Loaded MNCAATourneySeeds.csv into DataFrame MNCAATourneySeeds
Loaded MNCAATourneySlots.csv into DataFrame MNCAATourneySlots
Loaded MRegularSeasonCompactResults.csv into DataFrame MRegularSeasonCompactResults
Loaded MRegularSeasonDetailedResults.csv into DataFrame MRegularSeasonDetailedResults
Loaded MSeasons.csv into DataFrame MSeasons
Loaded MSecondaryTourneyCompactResults.csv into DataFrame MSecondaryTourneyCompactResults
Loaded MSecondaryTourneyTeams.csv into DataFrame MSecondar

In [5]:
# Show the key of every DataFrame held in `dataframes`, one per line.
print('DataFrames available: ')
for df_name in dataframes:
    print(df_name)

DataFrames available: 
Cities
Conferences
MConferenceTourneyGames
MGameCities
MMasseyOrdinals
MNCAATourneyCompactResults
MNCAATourneyDetailedResults
MNCAATourneySeedRoundSlots
MNCAATourneySeeds
MNCAATourneySlots
MRegularSeasonCompactResults
MRegularSeasonDetailedResults
MSeasons
MSecondaryTourneyCompactResults
MSecondaryTourneyTeams
MTeamCoaches
MTeamConferences
MTeams
MTeamSpellings
SampleSubmissionStage1
SeedBenchmarkStage1
WConferenceTourneyGames
WGameCities
WNCAATourneyCompactResults
WNCAATourneyDetailedResults
WNCAATourneySeeds
WNCAATourneySlots
WRegularSeasonCompactResults
WRegularSeasonDetailedResults
WSeasons
WSecondaryTourneyCompactResults
WSecondaryTourneyTeams
WTeamConferences
WTeams
WTeamSpellings


In [41]:
# Preview the first five rows of the men's teams table.
dataframes['MTeams'].head(5)

Unnamed: 0,TeamID,TeamName,FirstD1Season,LastD1Season
0,1101,Abilene Chr,2014,2025
1,1102,Air Force,1985,2025
2,1103,Akron,1985,2025
3,1104,Alabama,1985,2025
4,1105,Alabama A&M,2000,2025


In [None]:
# Display the men's regular-season compact results (pandas elides the
# middle rows of a large frame in the rich display).
# TODO: add the intended groupby aggregation here; a bare `.groupby`
# attribute access only shows the bound method, not any data.
dataframes['MRegularSeasonCompactResults']

Unnamed: 0,Season,DayNum,WTeamID,WScore,LTeamID,LScore,WLoc,NumOT
0,1985,20,1228,81,1328,64,N,0
1,1985,25,1106,77,1354,70,H,0
2,1985,25,1112,63,1223,56,H,0
3,1985,25,1165,70,1432,54,H,0
4,1985,25,1192,86,1447,74,H,0
...,...,...,...,...,...,...,...,...
190766,2025,85,1401,75,1328,68,H,0
190767,2025,85,1405,86,1325,83,A,0
190768,2025,85,1428,69,1153,66,H,0
190769,2025,85,1444,74,1123,71,A,0


In [None]:
def bayes_theorem(p_spam=0.2, p_word_given_spam=0.7, p_word_given_non_spam=0.10):
    """Return P(spam | word) computed with Bayes' theorem.

    P(spam | word) = P(word | spam) * P(spam) / P(word), where
    P(word) = P(word | spam) * P(spam) + P(word | non-spam) * P(non-spam)
    and P(non-spam) = 1 - P(spam).

    Args:
        p_spam: prior probability that an email is spam.
        p_word_given_spam: probability the word appears in a spam email.
        p_word_given_non_spam: probability the word appears in a non-spam email.

    Returns:
        The posterior probability that an email is spam given the word.

    Raises:
        ValueError: if any input is outside [0, 1], or if the word has zero
            total probability (the posterior is then undefined).
    """
    inputs = (
        ('p_spam', p_spam),
        ('p_word_given_spam', p_word_given_spam),
        ('p_word_given_non_spam', p_word_given_non_spam),
    )
    for label, value in inputs:
        if not 0.0 <= value <= 1.0:
            raise ValueError(f"{label} must be a probability in [0, 1], got {value}")

    p_non_spam = 1.0 - p_spam

    # Law of total probability: overall chance of seeing the word.
    p_word = p_word_given_spam * p_spam + p_word_given_non_spam * p_non_spam
    if p_word == 0:
        raise ValueError("P(word) is zero; the posterior is undefined")

    p_spam_given_word = (p_word_given_spam * p_spam) / p_word

    return p_spam_given_word

print(f"Probability that email is spam given 'win': {bayes_theorem():.4f}")


In [1]:

import numpy as np
import pandas as pd

#markov process using monte carlo simulation

# The robot starts in A.
# It takes 3 steps.
# We want to estimate P(A | ends in C after 3 steps)
# using Monte Carlo simulation.


# P (A|Ends in C) == (Runs that started in A and ended in C)/Total runs that end in C

def simulate_markov_process(num_simulations = 100000, num_steps = 3, seed = None):
    """Estimate P(started in A | ended in C) by Monte Carlo simulation.

    Each run starts in state 'A' and takes ``num_steps`` transitions:
        A -> A (0.5) or B (0.5)
        B -> A (0.3) or C (0.7)
        C -> A (always)

    Args:
        num_simulations: number of independent runs to simulate.
        num_steps: number of transitions taken in each run.
        seed: optional seed for the random generator, for reproducible runs.

    Returns:
        (runs that started in A and ended in C) / (runs that ended in C),
        or ``nan`` when no run ended in C.
    """
    # NOTE(review): every run starts in A, so the ratio is 1.0 whenever at
    # least one run ends in C. Randomise the start state if a non-trivial
    # posterior is intended.
    transitions = {
        'A': (('A', 'B'), (0.5, 0.5)),
        'B': (('A', 'C'), (0.3, 0.7)),
        'C': (('A',), (1.0,)),
    }
    rng = np.random.default_rng(seed)

    count_A_given_C = 0
    count_C = 0

    for _ in range(num_simulations):
        start_state = 'A'
        state = start_state

        for _step in range(num_steps):
            next_states, probs = transitions[state]
            # Draw an index so `state` stays a plain Python string.
            state = next_states[rng.choice(len(next_states), p=probs)]

        if state == 'C':
            count_C += 1
            if start_state == 'A':
                count_A_given_C += 1

    if count_C == 0:
        # No run ended in C, so the conditional probability is undefined.
        return float('nan')

    return count_A_given_C / count_C

    
