# COGS 108 - Final Project

## Important

- ONE, and only one, member of your group should upload this notebook to TritonED. 
- Each member of the group will receive the same grade on this assignment. 
- Keep the file name the same: submit the file 'FinalProject.ipynb'.
- Only upload the .ipynb file to TED; do not upload any associated data. Make sure that every cell whose output you want graders to see has been executed.

## Group Members: Fill in the Student IDs of each group member here

Replace the lines below to list each person's full student ID, UCSD email, and full name.

- A12814729
- A11983710
- A91097653
- A13497348
- A12433857
- A11774341



Start your project here.

## Introduction and Background

## Data Description

In [None]:
#The imports for this project
import pandas as pd
import numpy as np
import seaborn as sp
import matplotlib as mplot
from collections import defaultdict
from scipy import stats
import patsy
import statsmodels.api as sm
import scipy.stats as stats
from scipy.stats import ttest_ind, chisquare, normaltest
from sklearn import preprocessing

## Data Cleaning

In [None]:
#load the three input tables; the Store column of stores.csv is forced to int
features = pd.read_csv('features.csv')
stores = pd.read_csv('stores.csv', dtype={'Store': int})
weeksalesdb = pd.read_csv('train.csv')

In [None]:
#remove the MarkDown1..MarkDown5 columns from features (in place)
markdown_cols = ['MarkDown' + str(i) for i in range(1, 6)]
features.drop(columns = markdown_cols, inplace = True)

In [None]:
#merges the department rows into one Weekly_Sales total per (Store, Date).
#groupby does the summing in a single vectorised pass instead of a Python-level
#iterrows loop; sort=False keeps the groups in order of first appearance, which
#is the order the original loop inserted its keys.
#NOTE: sum() skips missing Weekly_Sales values instead of propagating NaN.
store_totals = weeksalesdb.groupby(['Store', 'Date'], sort=False)['Weekly_Sales'].sum()

#same 'store.date' -> total mapping (and same defaultdict type) as before
agg_sales = defaultdict(int, {
    str(store) + '.' + date: total
    for (store, date), total in store_totals.items()
})

In [None]:
#converts back into db with updated sales value.
#Rows are collected in a plain list and the DataFrame is built once: growing a
#DataFrame with .loc[len(df)] reallocates on every row, and building it in one
#go lets pandas infer a numeric dtype for Weekly_Sales.
rows = []
for key, value in agg_sales.items():
    #keys look like '<store>.<date>'; both parts are still strings here
    store, date = key.split('.')
    rows.append((store, date, value))

storesales = pd.DataFrame(rows, columns=['Store','Date','Weekly_Sales'])

In [None]:
#function to convert date to integer
def convert_date(date):
    """Turn a dash-separated date string into an integer.

    Surrounding whitespace is stripped, every '-' is removed, and what is
    left is parsed with int(), e.g. '2010-02-05' -> 20100205.
    Raises ValueError if the remaining text is not an integer literal.
    """
    digits = date.strip().replace('-', '').strip()
    return int(digits)

#replace the text dates in both tables with their integer form
for frame in (storesales, features):
    frame['Date'] = frame['Date'].apply(convert_date)

In [None]:
#convert store column to int (was str)
#Store came out of key.split('.') as text; it has to be numeric before it is
#used as a merge key against features
storesales['Store'] = pd.to_numeric(storesales['Store'])
#preview the first 10 rows of features
features.head(10)

In [None]:
#exported csv files; index = False keeps the row index out of the files (it
#would otherwise be written as an extra unnamed first column), matching how
#features_clean.csv is written
storesales.to_csv('storesales.csv', index = False)
features.to_csv('features2.csv', index = False)

In [None]:
#outer-join sales and features on (Store, Date); rows that exist in only one of
#the two tables are kept, with NaN in the columns coming from the other table
merged = storesales.merge(features, how = 'outer', on = ['Store', 'Date'])

In [None]:
#keep only the rows that have an Unemployment value
merged = merged.dropna(subset = ['Unemployment'])

In [None]:
#export cleaned features: the merged sales + features table, written without
#the row index so it reads back with the same columns
merged.to_csv('features_clean.csv', index = False)

In [None]:
#preview the first 10 rows of the merged table
merged.head(10)

In [None]:
#flag weeks whose sales are more than 3 standard deviations above the mean
#(high side only); most of these are on or near holidays.
sales = merged['Weekly_Sales']
upper_cutoff = sales.mean() + 3 * sales.std()
outliers = merged[sales > upper_cutoff]
outliers

## Data Visualization

In [None]:
#scatter of Unemployment against Temperature
features.plot(kind='scatter', x='Temperature', y='Unemployment')

In [None]:
#distribution of the Temperature column
features['Temperature'].plot(kind='hist')

## Data Analysis and Results

In [None]:
#reload the merged sales + features table that was written to features_clean.csv.
#NOTE: this rebinds `features`, which until now held the features.csv data
#(without Weekly_Sales); from here on it includes the Weekly_Sales column.
features = pd.read_csv('features_clean.csv')

In [None]:
def _fit_ols(formula):
    """Build patsy design matrices for `formula` from `features` and fit an OLS model.

    Returns the tuple (outcome, predictors, model, results).
    """
    outcome, predictors = patsy.dmatrices(formula, features)
    model = sm.OLS(outcome, predictors)
    return outcome, predictors, model, model.fit()


#one single-predictor regression of Unemployment per candidate variable
out_sales, pred_sales, mod_sales, res_sales = _fit_ols('Unemployment ~ Weekly_Sales')

out_temp, pred_temp, mod_temp, res_temp = _fit_ols('Unemployment ~ Temperature')

out_fuel, pred_fuel, mod_fuel, res_fuel = _fit_ols('Unemployment ~ Fuel_Price')

out_cpi, pred_cpi, mod_cpi, res_cpi = _fit_ols('Unemployment ~ CPI')

In [None]:
#print the four single-predictor summaries as separate blocks; with print's
#default sep=' ' each table would start on the last line of the previous one
print(res_sales.summary(), res_temp.summary(), res_fuel.summary(), res_cpi.summary(), sep = '\n\n')

In [None]:
#regress Unemployment on Weekly_Sales and CPI together
formula1 = 'Unemployment ~ Weekly_Sales + CPI'
out1, pred1 = patsy.dmatrices(formula1, features)
mod1 = sm.OLS(out1, pred1)
res1 = mod1.fit()

In [None]:
#summary table for the Weekly_Sales + CPI model
print(res1.summary())

In [None]:
#regress Unemployment on all four candidate predictors at once
formula2 = 'Unemployment ~ Weekly_Sales + CPI + Fuel_Price + Temperature'
out2, pred2 = patsy.dmatrices(formula2, features)
mod2 = sm.OLS(out2, pred2)
res2 = mod2.fit()

In [None]:
#summary table for the four-predictor model
print(res2.summary())

In [None]:
def standardizevalue(self, df, label):
    """Return column `label` of `df` as z-scores.

    Every value has the column mean subtracted and is then divided by the
    column's standard deviation as computed by Series.std() (pandas' default,
    the sample standard deviation with ddof=1).

    Parameters:
        self: unused; kept so existing positional calls keep working.
        df: DataFrame holding the column. It is only read, never modified.
        label: label of the column to standardize.

    Returns:
        A new Series with the same index as the column.

    A column whose standard deviation is 0 produces NaN/inf values, because
    the division is not guarded.
    """
    #the subtraction and division below already allocate a new Series, so no
    #defensive deep copy of the whole frame is needed to protect `df`
    series = df.loc[:, label]
    return (series - series.mean()) / series.std()

In [None]:
#columns we want to standardize
numericcolumns = features[['Weekly_Sales', 'Temperature', 'Fuel_Price', 'CPI', 'Unemployment']]
#get the column names
names = numericcolumns.columns
#create scaler
scaler = preprocessing.StandardScaler()
#apply transformation; fit_transform returns a bare array, so wrap it back into a
#DataFrame that carries features' own index. Assigning a DataFrame aligns on the
#index, so a fresh 0..n-1 index would only line up if features had that index too.
normaled = pd.DataFrame(scaler.fit_transform(numericcolumns), columns=names, index=features.index)
#start from a copy of features and overwrite just the standardized columns;
#overwriting existing columns leaves the column order as in features
features_normal = features.copy()
features_normal[names] = normaled
#export csv file
features_normal.to_csv('features_normal.csv', index = False)

features_normal

## Privacy/Ethics Concerns

## Conclusions and Discussion