import pandas as pd
# ^^^ pyforest auto-imports - don't write above this line


# Main Question: Do American teams have more American Players than Canadian Teams and vice versa for 2018-2019? 

## Imports

In [77]:
import warnings
from functions import *
import pickle
sns.set(style="whitegrid")
from scipy.stats import chi2_contingency

### Data

In [11]:
# Load the per-player table, using the first CSV column as the index.
player_df = pd.read_csv("player_df", index_col=0)

# Per-team column totals, with Team_id moved back out of the index.
groupby_df_sum = player_df.groupby('Team_id').sum().reset_index()

# Per-team non-null counts for every column (Team_id stays as the index).
groupby_df_count = player_df.groupby('Team_id').count()

<IPython.core.display.Javascript object>

# Calculating Percent of Players in each Group

### Splitting by Nation of Team

In [None]:
# previous method of finding canadian/american teams was mistaken

In [35]:
# Player rows whose team is flagged as Canadian, then the distinct ids of
# those teams.
can_teams = player_df.loc[player_df['Canadian_team'] == True]
can_team_ids = list(can_teams['Team_id'].unique())

In [78]:
# Split the per-team totals in two: teams whose id is in can_team_ids, and
# every other team (treated as American from here on).
canadian_mask = groupby_df_sum['Team_id'].isin(can_team_ids)
canadian_teams_groupby = groupby_df_sum[canadian_mask]
amer_teams_groupby = groupby_df_sum[~canadian_mask]

In [79]:
# Head-counts summed over all Canadian teams.
# NOTE(review): presumably `Dummy` is 1 for every player, so its sum is the
# total roster size - confirm against how player_df was built.
all_players_in_can_teams = canadian_teams_groupby['Dummy'].sum()
all_can_players_in_can_teams = int(canadian_teams_groupby['Canadian_player'].sum())
all_amer_players_in_can_teams = int(canadian_teams_groupby['Amer_player'].sum())

In [80]:
# Head-counts summed over all American teams.
# NOTE(review): presumably `Dummy` is 1 for every player, so its sum is the
# total roster size - confirm against how player_df was built.
all_players_in_amer_teams = amer_teams_groupby['Dummy'].sum()
all_can_players_in_amer_teams = int(amer_teams_groupby['Canadian_player'].sum())
all_amer_players_in_amer_teams = int(amer_teams_groupby['Amer_player'].sum())

#### Calculating percentage of canadians in each group

In [81]:
# Fraction (0-1, not a percentage) of players on Canadian teams who are Canadian.
canadians_in_canada_teams_pct = all_can_players_in_can_teams / all_players_in_can_teams
canadians_in_canada_teams_pct

0.5

In [82]:
# Fraction (0-1, not a percentage) of players on American teams who are Canadian.
canadians_in_amer_teams_pct = all_can_players_in_amer_teams / all_players_in_amer_teams
canadians_in_amer_teams_pct

0.4343891402714932

#### Calculating percentage of americans in each group

In [83]:
# Fraction (0-1, not a percentage) of players on Canadian teams who are American.
americans_in_canada_teams_pct = all_amer_players_in_can_teams / all_players_in_can_teams
americans_in_canada_teams_pct

0.24193548387096775

In [84]:
# Fraction (0-1, not a percentage) of players on American teams who are American.
# The share of Americans is slightly higher on American teams than on Canadian
# teams; whether that gap is significant is tested further down.
americans_in_amer_teams_pct = all_amer_players_in_amer_teams / all_players_in_amer_teams
americans_in_amer_teams_pct

0.2895927601809955

# Testing for Statistical Significance
Chi-square test of independence, using a 5% significance level.

## Making Contingency Table 

In [85]:
# Canadian teams: players who are neither Canadian nor American, i.e. the
# total head-count minus the two North American groups.
other_nats_in_can_teams = int(
    all_players_in_can_teams
    - (all_can_players_in_can_teams + all_amer_players_in_can_teams)
)

In [88]:
# American teams: players who are neither Canadian nor American, i.e. the
# total head-count minus the two North American groups.
other_nats_in_amer_teams = int(
    all_players_in_amer_teams
    - (all_can_players_in_amer_teams + all_amer_players_in_amer_teams)
)

In [92]:
# Sanity check: the count was cast with int(), so this should be a plain
# Python int before it goes into the contingency table.
type(all_can_players_in_amer_teams)

int

In [93]:
# Observed head-counts. Rows are team country, columns are player nationality:
#
#                    canadians | americans | other
#   canadian teams
#   american teams
contingency_table = np.array([
    [all_can_players_in_can_teams, all_amer_players_in_can_teams, other_nats_in_can_teams],
    [all_can_players_in_amer_teams, all_amer_players_in_amer_teams, other_nats_in_amer_teams],
])

## Running Test

In [95]:
# Chi-square test of independence between team country (rows) and player
# nationality (columns). Unpacks the test statistic, the p-value, the degrees
# of freedom and the table of expected counts.
chi2, p_value, deg_of_freedom, expected = chi2_contingency(contingency_table)

In [97]:
# Display the p-value; compare it against the 5% significance threshold.
p_value

0.6326338155050115

In [101]:
# Display the observed counts (rows: Canadian teams, American teams;
# columns: Canadians, Americans, other).
contingency_table

array([[31, 15, 16],
       [96, 64, 61]])

In [102]:
# Display the expected counts returned by chi2_contingency, for comparison
# with the observed table.
expected  

array([[27.82332155, 17.30742049, 16.86925795],
       [99.17667845, 61.69257951, 60.13074205]])

## TEMPORARY CONCLUSION

There is NOT a significant difference in terms of the numbers of canadians, americans, and other nationalities between American and Canadian NHL Teams. 

# Testing for players playing in their home country

The previous test found no significant difference between teams in the two countries in the numbers of Canadians, Americans, and players of other nationalities. Here, I want to ask a more specific question: are there more Canadians playing in Canada, and more Americans playing in America?

## Canadians playing in canada

### Making contingency table

In [109]:
# Collapse Americans and other nationalities into a single "not Canadian"
# group for each team country.
non_canadians_in_can_team = other_nats_in_can_teams + all_amer_players_in_can_teams
non_canadians_in_amer_teams = other_nats_in_amer_teams + all_amer_players_in_amer_teams

In [110]:
# 2x2 observed counts. Rows are Canadian teams then American teams;
# columns are Canadian players then non-Canadian players.
can_contingency_table = np.array([
    [all_can_players_in_can_teams, non_canadians_in_can_team],
    [all_can_players_in_amer_teams, non_canadians_in_amer_teams],
])

## Running Test

In [111]:
# Chi-square test of independence on the Canadian / non-Canadian 2x2 table.
# NOTE(review): this table has one degree of freedom, and scipy applies Yates'
# continuity correction by default in that case - confirm that is intended.
can_chi2, can_p_value, can_deg_of_freedom, can_expected = chi2_contingency(can_contingency_table)

In [112]:
# Display the p-value for the Canadian-player test; compare against 0.05.
can_p_value

0.43926931071597886

## Americans Playing in America

In [113]:
# Collapse Canadians and other nationalities into a single "not American"
# group for each team country.
non_americans_in_can_team = other_nats_in_can_teams + all_can_players_in_can_teams
non_americans_in_amer_teams = other_nats_in_amer_teams + all_can_players_in_amer_teams

In [115]:
# 2x2 observed counts. Rows are Canadian teams then American teams;
# columns are American players then non-American players.
amer_contingency_table = np.array([
    [all_amer_players_in_can_teams, non_americans_in_can_team],
    [all_amer_players_in_amer_teams, non_americans_in_amer_teams],
])

## Running Test

In [116]:
# Chi-square test of independence on the American / non-American 2x2 table.
# NOTE(review): this table has one degree of freedom, and scipy applies Yates'
# continuity correction by default in that case - confirm that is intended.
amer_chi2, amer_p_value, amer_deg_of_freedom, amer_expected = chi2_contingency(amer_contingency_table)

In [117]:
# Display the p-value for the American-player test; compare against 0.05.
amer_p_value

0.5625534759123411

## TEMPORARY CONCLUSION 

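Canadian teams have a higher share of Canadian players than American teams do (50% vs. 43%), and American teams have a slightly higher share of American players than Canadian teams do (29% vs. 24%). However, neither difference is statistically significant at the 5% level (p ≈ 0.44 for Canadians, p ≈ 0.56 for Americans). In other words, Canadians appear to stay local a little more than Americans do, but this data cannot distinguish that pattern from chance.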

# Conclusion

The only observation we can make from this notebook is descriptive: a larger share of players on Canadian teams are Canadian (50%) than on American teams (43%). None of the tests reached significance at the 5% level: not the overall test of country preference for Canadian, American, and other-nationality players (p ≈ 0.63), not the test for Canadian players (p ≈ 0.44), and not the test for American players (p ≈ 0.56).

In other words, Canadians appear to stay local slightly more than Americans do, but the difference is not statistically significant.

A piece of this puzzle may be the currencies that the two countries use: American dollars versus Canadian dollars. Salaries are standardized to American dollars, and players can choose which currency they want to be paid in. Since the American dollar is worth more than the Canadian dollar, players on Canadian teams can, depending on the exchange rate, end up earning less than players on American teams (https://www.quora.com/Do-all-NHL-players-regardless-of-whether-they-play-for-a-Canadian-team-receive-their-salaries-in-U-S-Currency).

The key piece to the puzzle here, though, is taxes, taxes, taxes. All players pay national taxes and some have to pay state/province taxes on top of that amount. This 