## $\color{SkyBlue}{\text{Weather Analysis}}$
#### $\color{SkyBlue}{\text{Roman Lynch}}$


In [1]:
import numpy as np
import pandas as pd

$\rule{27cm}{0.4pt}$
### K-Means Algorithm

In [16]:
#--------------------------------------------------------------------------------#
def dist(x1, x2):
    return np.sqrt(np.sum(x1-x2)**2)
#--------------------------------------------------------------------------------#

In [17]:
#--------------------------------------------------------------------------------#
def kMeans(df, k=4, tol=0.05): 
    """
    Usage: input 
        df=data frame, 
        k=# of clusters
        tol=tolerance for L_2 convergance check on centroids
    """    
    #---------------------------------------#
    #-- Initialize --#
    clusters = np.zeros(len(df))
    centroids = df.sample(k).values
    mean_error = np.inf
    converged = False
    
    num_iterations = 0
    
    #---------------------------------------#
    #-- LOOP UNTIL CONVERGENCE --#
    while not(converged):
        
        #-- Cluster Assignment --#
        for index, data in enumerate(df.values):
            distances = [dist(data, centroid) for centroid in centroids]
            clusters[index] = np.argmin(distances)
        
        #-- Update Centroids --#
        updated_centroids = np.empty((k, df.shape[1]))
        for index in range(k):
            clust_data = df[clusters == index]
            centroid = clust_data.mean(axis=0)
            updated_centroids[index, :] = centroid
            
        #-- Calculate Meanerror --#
        errors = []
        for data, cluster in zip(df.values, clusters):
            centroid = updated_centroids[int(cluster)]
            errors.append(dist(data, centroid)**2)
        
        error_arr = np.array(errors)
        
        #-- Calculate Reconstruction Error --#
        rec_err = np.sum(error_arr)/len(df)
        
        #-- Check for Convergence --#
        if (abs(rec_err - mean_error) < tol):
            converged = True
#             print('Converged at iteration {} with a change of {}'.format(num_iterations, abs(rec_err - mean_error)))
        
        # Update Values
        centroids = updated_centroids
        mean_error = rec_err
        num_iterations = num_iterations + 1
        
    #---------------------------------------#
    return centroids, clusters, mean_error
#--------------------------------------------------------------------------------#

$\rule{27cm}{0.4pt}$ 
### Dataframe Composition

In [49]:
# Read in CSV files
df_weather = pd.read_csv("data/city_temperature.csv")
df_rankings = pd.read_csv("data/rankings.csv")
df_teams = pd.read_csv("data/team.csv")

# Only take important columns
df_weather = df_weather[['City', 'Year', 'AvgTemperature']]
df_rankings = df_rankings[['Team', 'Rank', 'Year']]
df_teams = df_teams[['full_name', 'abbreviation', 'city']]

#temp = df_weather[df_weather['City'] == 'Denver']
#temp2 = temp[temp['Year'] == 2018]

#print(temp2['AvgTemperature'].head(5))
# Clean "*" from all teams in df_rankings
for i in range(len(df_rankings)):
    if (df_rankings["Team"].iloc[i].find('*')):
        df_rankings["Team"].iloc[i] = df_rankings["Team"].iloc[i].replace('*', '')

#print(df_weather.head(5))
#print(df_rankings.head(5))
#print(df_teams.head(5))

#world_cities = df_weather['City'].unique()
nba_cities = set(df_teams['city'].unique())

df_weather = df_weather[nba_cities.find(df_weather['City'])]
df_master = pd.merge(df_rankings,df_teams, left_on="Team", right_on="full_name")
#df_master = pd.merge(df_master,df_weather, left_on=['Year', 'city'], right_on=['Year', 'City'])

print(df_master.head(5))




  df_weather = pd.read_csv("data/city_temperature.csv")
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_rankings["Team"].iloc[i] = df_rankings["Team"].iloc[i].replace('*', '')
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_rankings["Team"].iloc[i] = df_rankings["Team"].iloc[i].replace('*', '')
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_rankings["Team"].iloc[i] = df_rankings["Team"].iloc[i].replace('*', '')
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the docum

AttributeError: 'set' object has no attribute 'find'