In [None]:
import pandas as pd
from pandas_profiling import ProfileReport as pr
from datetime import datetime
from dateutil.relativedelta import relativedelta
from dateutil.parser import parse
import requests
import json
from sklearn.linear_model import LinearRegression

In [None]:
# Load the training data from the CSV file into a DataFrame.
train_data = pd.read_csv("Challenge1_train_data.csv")

In [None]:
train_data.head()

In [None]:
train_data.info()

In [None]:
train_data.date.min(),train_data.date.max()

In [None]:
# Cut-off for the hold-out window: 14 days before the latest date in the data.
# The maximum is run through dateutil's parse(), so `date` is presumably still
# stored as text at this point.
latest_date = parse(train_data['date'].max())
test_date = latest_date + relativedelta(days=-14)
str(test_date)

In [None]:
# Build a profiling report of the training data (explorative mode enabled)
# and write it to an HTML file.
profile = pr(train_data, title="ATD Data Analysis", explorative=True)
profile.to_file("atd_report.html")

In [None]:
train_data[train_data['retail_price']==0]

In [None]:
len(train_data)

In [None]:
# Weather history for each city comes from the Open-Meteo archive API.
# All four requests share the same date range, timezone and daily variables;
# only the coordinates differ, so the URLs are built from one template.
# Coordinates are kept as strings so the resulting URLs are reproduced exactly.
_WEATHER_URL_TEMPLATE = (
    'https://archive-api.open-meteo.com/v1/era5'
    '?latitude={lat}&longitude={lon}'
    '&start_date=2020-09-20&end_date=2022-09-19&timezone=GMT'
    '&daily=temperature_2m_max,temperature_2m_min,precipitation_sum,windspeed_10m_max'
)

weather_sacramento_url = _WEATHER_URL_TEMPLATE.format(lat='38.5816', lon='-121.4944')
weather_bakersfield_url = _WEATHER_URL_TEMPLATE.format(lat='35.3733', lon='-119.0187')
weather_sanjose_url = _WEATHER_URL_TEMPLATE.format(lat='37.3387', lon='-121.8853')
weather_oakland_url = _WEATHER_URL_TEMPLATE.format(lat='37.8044', lon='-122.2712')


def get_weather_data(url, city_name, timeout=30):
    """Download daily weather data from ``url`` and label it with a city name.

    Parameters
    ----------
    url : str
        Endpoint whose JSON response carries a top-level ``daily`` object.
    city_name : str
        Value written to the ``city_name`` column of every returned row.
    timeout : float, optional
        Seconds to wait for the server before giving up (default 30), so a
        stalled connection cannot hang the notebook indefinitely.

    Returns
    -------
    pandas.DataFrame
        One column per entry of the ``daily`` object plus ``city_name``.

    Raises
    ------
    requests.HTTPError
        If the server answers with an error status code.
    ValueError
        If the response JSON has no ``daily`` entry.
    """
    weather_resp = requests.get(url, timeout=timeout)
    # Fail loudly on 4xx/5xx instead of trying to parse an error payload.
    weather_resp.raise_for_status()

    daily = weather_resp.json().get('daily')
    if daily is None:
        raise ValueError(f"No 'daily' section in weather response for {city_name}")

    weather = pd.DataFrame.from_dict(daily)
    weather['city_name'] = city_name
    return weather

# Download the weather history for each city and show a summary of every frame.
# DataFrame.info() prints its report itself and returns None, so it is called
# directly; wrapping it in print() would emit an extra "None" line.
weather_sacramento = get_weather_data(weather_sacramento_url, 'SACRAMENTO')
weather_sacramento.info()

weather_bakersfield = get_weather_data(weather_bakersfield_url, 'BAKERSFIELD')
weather_bakersfield.info()

weather_sanjose = get_weather_data(weather_sanjose_url, 'SAN JOSE')
weather_sanjose.info()

weather_oakland = get_weather_data(weather_oakland_url, 'OAKLAND')
weather_oakland.info()



In [None]:
# Stack the per-city frames into a single table with a fresh 0..n-1 index.
city_frames = [
    weather_sacramento,
    weather_bakersfield,
    weather_sanjose,
    weather_oakland,
]
weather_data = pd.concat(city_frames, ignore_index=True)
weather_data.head()

In [None]:
# Left-join the weather columns onto the training rows by (date, city), then
# remove the weather-side join keys, which duplicate `date` and `dc_name`.
train_data = pd.merge(
    train_data,
    weather_data,
    how='left',
    left_on=['date', 'dc_name'],
    right_on=['time', 'city_name'],
)
train_data = train_data.drop(columns=['city_name', 'time'])
train_data.head()

## Exploratory Data Analysis

In [None]:
# Rows whose retail price is exactly zero, counted per (dc_name, size_code).
is_zero_price = train_data['retail_price'] == 0
zero_data = train_data[is_zero_price]
zero_by_dc_and_size = zero_data.groupby(['dc_name', 'size_code'])
grouped_size_data = zero_by_dc_and_size['date'].count()
grouped_size_data

In [None]:
# Zero-price rows counted per dc_name only.
zero_by_dc = zero_data.groupby('dc_name')
grouped_dc_data = zero_by_dc['date'].count()
grouped_dc_data

## Data Classification

In [None]:
# Composite identifier "<dc_name>|<size_code>" for each row; size_code is cast
# to str so it can be concatenated with the dc_name text.
train_data['Key'] = train_data['dc_name'] +'|' +train_data['size_code'].astype(str)

In [None]:
# Convert the `date` column to pandas datetimes so date comparisons and the
# `.dt` accessor can be used on it.
train_data['date'] = pd.to_datetime(train_data['date'])
train_data.head()

In [None]:
# Keep only the rows dated strictly before the hold-out cut-off `test_date`.
train_df = train_data[train_data['date']<str(test_date)]

In [None]:
# Mean and standard deviation of total_tires for every Key, computed on the
# rows before the cut-off only.
tires_by_key = train_df.groupby('Key')['total_tires']
cv_data = tires_by_key.agg(average='mean', sd='std').reset_index()

In [None]:
# Squared coefficient of variation per Key: (sd / average) ** 2.
variation_ratio = cv_data['sd'].div(cv_data['average'])
cv_data['cv_sqr'] = variation_ratio.pow(2)
cv_data

In [None]:
# One row per (Key, date) on which total_tires was positive; `count` is the
# number of such rows for that pair.
non_zero_df = train_df[train_df['total_tires']>0]
# NaN > 0 is False, so missing values are already excluded by the filter above;
# the dropna is kept as an explicit guard. `subset` is passed as a list because
# ('total_tires') is just a parenthesised string, which older pandas rejects.
non_zero_df = non_zero_df.dropna(subset=['total_tires'])
tire_by_date = (
    non_zero_df
    .groupby(['Key', 'date'])
    .agg(count=('Key', 'count'))
    .reset_index()
)

In [None]:
# Number of rows in tire_by_date for each Key, most frequent Key first.
skus=tire_by_date.Key.value_counts()
skus

In [None]:
# For every Key, attach the date of the preceding row of the same Key as
# `previous_date` (missing for the first row of each Key).
# A grouped shift does this in one pass; it replaces a per-Key loop that
# assigned into filtered slices (SettingWithCopyWarning) and re-concatenated
# the growing result on every iteration.
previous_date = tire_by_date.groupby('Key')['date'].shift(1)
with_previous = tire_by_date.assign(previous_date=previous_date)

# Preserve the earlier row order: Keys in the order of `skus`, rows within a
# Key in their existing order (mergesort is stable).
key_position = {key: pos for pos, key in enumerate(skus.index)}
row_order = with_previous['Key'].map(key_position).sort_values(kind='mergesort').index
new_df = with_previous.loc[row_order]

In [None]:
# Time elapsed between each row's date and the previous date of the same Key.
new_df['duration']=new_df['date']- new_df['previous_date']

In [None]:
new_df.head()

In [None]:
# Express each gap as a number of days and average the gaps per Key (ADI).
# `.dt.days` reads the day count straight from the timedelta column, avoiding
# a round-trip through strings ("3 days" -> "3 " -> 3); rows without a
# previous date stay missing and are ignored by the mean.
new_df['Duration'] = new_df['duration'].dt.days
ADI = new_df.groupby('Key').agg(ADI=('Duration', 'mean')).reset_index()
ADI

In [None]:
# Combine the ADI and CV-squared tables. `Key` is the only column the two
# frames have in common, so it is the join column.
adi_cv = ADI.merge(cv_data, on='Key')
adi_cv

In [None]:
def category(df, adi_cutoff=1.34, cv_sqr_cutoff=0.49):
    """Classify one row's demand pattern from its ADI and squared CV.

    Parameters
    ----------
    df : mapping or pandas.Series
        A single row providing the entries ``'ADI'`` and ``'cv_sqr'``.
    adi_cutoff : float, optional
        Boundary on ADI (default 1.34).
        NOTE(review): the demand-classification literature commonly quotes
        1.32 for this boundary; 1.34 is kept so existing results do not
        change - confirm which value is intended.
    cv_sqr_cutoff : float, optional
        Boundary on the squared coefficient of variation (default 0.49).

    Returns
    -------
    str or int
        ``'Smooth'``, ``'Lumpy'``, ``'Erratic'`` or ``'Intermittent'``;
        the integer ``0`` when no rule matches, which happens when either
        value is NaN because every comparison is then False.
    """
    adi = df['ADI']
    cv_sqr = df['cv_sqr']

    # Checked first so the corner point (adi == cutoff and cv_sqr == cutoff),
    # which also satisfies the 'Smooth' rule, is labelled 'Lumpy' as before.
    if adi >= adi_cutoff and cv_sqr >= cv_sqr_cutoff:
        return 'Lumpy'
    if adi <= adi_cutoff and cv_sqr <= cv_sqr_cutoff:
        return 'Smooth'
    if adi < adi_cutoff and cv_sqr > cv_sqr_cutoff:
        return 'Erratic'
    if adi > adi_cutoff and cv_sqr < cv_sqr_cutoff:
        return 'Intermittent'

    # No rule matched (NaN input): keep the original sentinel value.
    return 0

In [None]:
# Label every row of adi_cv by applying `category` row-wise (axis=1), then
# show the resulting Key/category pairs.
adi_cv['category']=adi_cv.apply(category,axis=1)
adi_cv[['Key','category']]

In [None]:
adi_cv.category.value_counts()


In [None]:
# Attach each Key's category to the full data set. With a left join, rows whose
# Key is absent from adi_cv are kept and get a missing category.
train_data = train_data.merge(adi_cv[['Key','category']],on=['Key'],how='left')

In [None]:
train_data

### Handling Missing Values

In [None]:
train_data[train_data['retail_price'].isna()]

In [None]:
# Impute the missing retail_price values for the SACRAMENTO rows.
# An explicit copy makes df_sacramento an independent frame, so the assignment
# below does not write into a slice of train_data (SettingWithCopyWarning).
df_sacramento = train_data[train_data['dc_name']=='SACRAMENTO'].copy()

# Size codes of the rows with a missing price (one entry per affected row).
sac_sizecode = df_sacramento[df_sacramento['retail_price'].isna()]['size_code'].tolist()

# Mean of the known, non-zero prices of each size_code, aligned to every row.
# Zero prices are masked out so they do not drag the mean down; a size_code
# with no usable price yields NaN and its rows therefore stay missing.
known_price = df_sacramento['retail_price'].where(df_sacramento['retail_price'] != 0)
size_mean_price = known_price.groupby(df_sacramento['size_code']).transform('mean')

# Only missing prices are replaced; existing values (including zeros) are kept.
df_sacramento['retail_price'] = df_sacramento['retail_price'].fillna(size_mean_price)

In [None]:
# Swap the original SACRAMENTO rows for their imputed versions: remove them
# from train_data, append df_sacramento, and renumber the index from zero.
indexes_to_drop = train_data.index[train_data['dc_name'] == 'SACRAMENTO']
remaining_rows = train_data.drop(index=indexes_to_drop)
train_data = pd.concat([remaining_rows, df_sacramento], ignore_index=True)

In [None]:
# Weekday name of each row's date, obtained through the `.dt` accessor.
train_data['day_name'] = train_data['date'].dt.day_name()

In [None]:
train_data.info()

## Univariate Models

In [None]:
# Select the identifier, date and total_tires columns and order the rows by
# Key, dc_name, size_code and date.
selected_columns = ['Key', 'dc_name', 'size_code', 'date', 'total_tires']
sort_columns = ['Key', 'dc_name', 'size_code', 'date']
uni_train_data = train_data[selected_columns].sort_values(sort_columns)
uni_train_data

In [None]:
# Make sure `date` holds pandas datetimes before it is compared with the cut-off.
uni_train_data['date'] = pd.to_datetime(uni_train_data['date'])
uni_train_data.head()

In [None]:
# Split on the cut-off: rows before it are used for training, rows on or after
# it are held out. Only `date` and `total_tires` are kept in either part.
cutoff = str(test_date)
before_cutoff = uni_train_data['date'] < cutoff
from_cutoff = uni_train_data['date'] >= cutoff
train_df = uni_train_data.loc[before_cutoff, ['date', 'total_tires']]
test_df = uni_train_data.loc[from_cutoff, ['date', 'total_tires']]

In [None]:
def model_training(sliced_data, split_date=None):
    """Fit a linear trend on one series and forecast its hold-out rows.

    The rows of ``sliced_data`` dated before ``split_date`` are used to fit a
    ``LinearRegression`` of ``total_tires`` on the numeric value of ``date``;
    the rows dated on or after ``split_date`` receive a forecast.

    Parameters
    ----------
    sliced_data : pandas.DataFrame
        Rows of a single series. Must contain a datetime ``date`` column and
        a ``total_tires`` column; any other columns are ignored.
    split_date : datetime-like or str, optional
        First date of the hold-out period. Defaults to the notebook-level
        ``test_date``.

    Returns
    -------
    pandas.DataFrame
        The hold-out rows with columns ``date``, ``total_tires``,
        ``Forecast`` (prediction rounded to the nearest integer) and
        ``Model``. ``Forecast`` is NaN when the series has no usable
        training rows.
    """
    if split_date is None:
        # Fall back to the cut-off defined at notebook level.
        split_date = test_date
    cutoff = pd.Timestamp(split_date)

    # Split the slice that was passed in. Reading and reassigning outer
    # `train_df` / `test_df` names inside the function raised
    # UnboundLocalError and ignored `sliced_data` altogether.
    series = sliced_data[['date', 'total_tires']]
    # Rows without a target cannot be used for fitting.
    train_part = series[series['date'] < cutoff].dropna(subset=['total_tires'])
    test_part = series[series['date'] >= cutoff].reset_index(drop=True)

    if train_part.empty or test_part.empty:
        # Nothing to fit on, or nothing to predict: skip the regression.
        test_part['Forecast'] = float('nan')
    else:
        # Dates are converted to their integer representation, then to float,
        # so they can serve as the single numeric feature.
        x_train = train_part['date'].values.astype('int64').astype(float).reshape(-1, 1)
        y_train = train_part['total_tires'].values
        x_test = test_part['date'].values.astype('int64').astype(float).reshape(-1, 1)

        regressor = LinearRegression()
        regressor.fit(x_train, y_train)
        test_part['Forecast'] = regressor.predict(x_test).round()

    test_part['Model'] = 'Linear_Regression'
    return test_part
    

In [None]:
# Run the model once per Key and collect the forecasts. The groupby/apply call
# was previously written twice in a row, which fitted every model two times
# only to overwrite the first result.
result_df = uni_train_data.groupby('Key').apply(model_training).reset_index()
result_df = result_df[['Key','date','total_tires','Forecast','Model']]

result_df.reset_index()

In [None]:
result_df.date.unique()