## TITLE: REGRESSION PROJECT

In [10]:
#%pip install matplotlib

## IMPORTING LIBRARIES

In [24]:
import pandas as pd
import numpy as np
# Raise pandas' display limits so up to 500 rows and 100 columns are shown
# before a displayed DataFrame is truncated.
pd.set_option("display.max_rows", 500)
pd.set_option("display.max_columns", 100)

import matplotlib as mpl
import matplotlib.dates as mdates
import seaborn as sns
from dotenv import dotenv_values 

from datetime import datetime

import warnings

import pyodbc
# NOTE(review): this silences every warning for the rest of the session,
# including pandas deprecation notices -- consider narrowing the filter.
warnings.filterwarnings('ignore') 

## LOADING DATASET

In [12]:
# Load the key/value pairs stored in the local '.env' file.
env_var = dotenv_values('.env')

# Fail fast when a credential is absent: otherwise the f-string below would
# interpolate the literal text "None" and the problem would only surface later
# as a hard-to-read ODBC login error.
required_keys = ("DATABASE", "SERVER", "USERNAME", "PASSWORD")
missing_keys = [key for key in required_keys if env_var.get(key) is None]
if missing_keys:
    raise KeyError(
        "Missing value(s) in '.env': " + ", ".join(missing_keys)
    )

# Get the values for the credentials you set in the '.env' file
database = env_var["DATABASE"]
server = env_var["SERVER"]
username = env_var["USERNAME"]
password = env_var["PASSWORD"]

# ODBC connection string; the doubled braces render as a literal {SQL Server}.
connection_string = f"DRIVER={{SQL Server}};SERVER={server};DATABASE={database};UID={username};PWD={password}"

In [13]:
# Open the ODBC connection using the connection string assembled from the
# '.env' credentials; the handle is reused by the pd.read_sql calls below.
connection=pyodbc.connect(connection_string)

In [14]:
# Harmonise dates written in the "%Y-%m-%d" format
def convert_to_date(w):
    """Parse a ``YYYY-MM-DD`` string into a ``datetime``.

    Parameters
    ----------
    w : str
        Date text in the ``%Y-%m-%d`` layout, e.g. ``"2017-08-16"``.

    Returns
    -------
    datetime.datetime
        The parsed date (time part is midnight).

    Raises
    ------
    ValueError
        If ``w`` does not match the ``%Y-%m-%d`` layout.
    """
    iso_day_format = "%Y-%m-%d"
    parsed = datetime.strptime(w, iso_day_format)
    return parsed

def _read_csv_with_dates(path, date_column="date", date_format="%Y-%m-%d"):
    """Read a CSV file and convert one column to ``datetime64``.

    The conversion uses ``pd.to_datetime`` with an explicit format instead of
    ``read_csv(date_parser=...)``: ``date_parser`` is deprecated since
    pandas 2.0, and parsing the whole column in one call avoids invoking a
    Python function once per value on the multi-million-row training file.

    Parameters
    ----------
    path : str
        Location of the CSV file.
    date_column : str, default "date"
        Name of the column holding the dates.
    date_format : str, default "%Y-%m-%d"
        ``strftime``-style layout the dates are expected to follow.

    Returns
    -------
    pandas.DataFrame
        The file contents with ``date_column`` parsed, column order unchanged.

    Raises
    ------
    ValueError
        If a value in ``date_column`` does not match ``date_format``.
    """
    frame = pd.read_csv(path)
    frame[date_column] = pd.to_datetime(frame[date_column], format=date_format)
    return frame


#Data from Server
#Table 1
query1="Select * from dbo.oil"
oil=pd.read_sql(query1,connection,parse_dates=["date"])
#Table 2
query2="Select * from dbo.holidays_events"
holidays=pd.read_sql(query2,connection,parse_dates=["date"])
#Table 3 (no date column to parse)
query3="Select * from dbo.stores"
stores=pd.read_sql(query3,connection)

#From onedrive
test = _read_csv_with_dates('test.csv')
sample_submission = pd.read_csv('sample_submission.csv')

#from github repo
train = _read_csv_with_dates('train.csv')
transactions = _read_csv_with_dates('transactions.csv')



## DATA OVERVIEW

In [15]:
# Preview the oil table read from dbo.oil.
oil

Unnamed: 0,date,dcoilwtico
0,2013-01-01,
1,2013-01-02,93.139999
2,2013-01-03,92.970001
3,2013-01-04,93.120003
4,2013-01-07,93.199997
...,...,...
1213,2017-08-25,47.650002
1214,2017-08-28,46.400002
1215,2017-08-29,46.459999
1216,2017-08-30,45.959999


In [16]:
# Preview the holidays table read from dbo.holidays_events.
holidays

Unnamed: 0,date,type,locale,locale_name,description,transferred
0,2012-03-02,Holiday,Local,Manta,Fundacion de Manta,False
1,2012-04-01,Holiday,Regional,Cotopaxi,Provincializacion de Cotopaxi,False
2,2012-04-12,Holiday,Local,Cuenca,Fundacion de Cuenca,False
3,2012-04-14,Holiday,Local,Libertad,Cantonizacion de Libertad,False
4,2012-04-21,Holiday,Local,Riobamba,Cantonizacion de Riobamba,False
...,...,...,...,...,...,...
345,2017-12-22,Additional,National,Ecuador,Navidad-3,False
346,2017-12-23,Additional,National,Ecuador,Navidad-2,False
347,2017-12-24,Additional,National,Ecuador,Navidad-1,False
348,2017-12-25,Holiday,National,Ecuador,Navidad,False


In [17]:
# Preview the stores table read from dbo.stores.
stores

Unnamed: 0,store_nbr,city,state,type,cluster
0,1,Quito,Pichincha,D,13
1,2,Quito,Pichincha,D,13
2,3,Quito,Pichincha,D,8
3,4,Quito,Pichincha,D,9
4,5,Santo Domingo,Santo Domingo de los Tsachilas,D,4
5,6,Quito,Pichincha,D,13
6,7,Quito,Pichincha,D,8
7,8,Quito,Pichincha,D,8
8,9,Quito,Pichincha,B,6
9,10,Quito,Pichincha,C,15


In [18]:
# Preview the frame read from test.csv.
test


Unnamed: 0,id,date,store_nbr,family,onpromotion
0,3000888,2017-08-16,1,AUTOMOTIVE,0
1,3000889,2017-08-16,1,BABY CARE,0
2,3000890,2017-08-16,1,BEAUTY,2
3,3000891,2017-08-16,1,BEVERAGES,20
4,3000892,2017-08-16,1,BOOKS,0
...,...,...,...,...,...
28507,3029395,2017-08-31,9,POULTRY,1
28508,3029396,2017-08-31,9,PREPARED FOODS,0
28509,3029397,2017-08-31,9,PRODUCE,1
28510,3029398,2017-08-31,9,SCHOOL AND OFFICE SUPPLIES,9


In [19]:
# Preview the frame read from transactions.csv.
transactions

Unnamed: 0,date,store_nbr,transactions
0,2013-01-01,25,770
1,2013-01-02,1,2111
2,2013-01-02,2,2358
3,2013-01-02,3,3487
4,2013-01-02,4,1922
...,...,...,...
83483,2017-08-15,50,2804
83484,2017-08-15,51,1573
83485,2017-08-15,52,2255
83486,2017-08-15,53,932


In [20]:
# Preview the frame read from train.csv.
train

Unnamed: 0,id,date,store_nbr,family,sales,onpromotion
0,0,2013-01-01,1,AUTOMOTIVE,0.000,0
1,1,2013-01-01,1,BABY CARE,0.000,0
2,2,2013-01-01,1,BEAUTY,0.000,0
3,3,2013-01-01,1,BEVERAGES,0.000,0
4,4,2013-01-01,1,BOOKS,0.000,0
...,...,...,...,...,...,...
3000883,3000883,2017-08-15,9,POULTRY,438.133,0
3000884,3000884,2017-08-15,9,PREPARED FOODS,154.553,1
3000885,3000885,2017-08-15,9,PRODUCE,2419.729,148
3000886,3000886,2017-08-15,9,SCHOOL AND OFFICE SUPPLIES,121.000,8


In [21]:
# Preview the frame read from sample_submission.csv.
sample_submission

Unnamed: 0,id,sales
0,3000888,0.0
1,3000889,0.0
2,3000890,0.0
3,3000891,0.0
4,3000892,0.0
...,...,...
28507,3029395,0.0
28508,3029396,0.0
28509,3029397,0.0
28510,3029398,0.0


In [27]:
## Put the dataframes in a list to make it easy to loop through them;
## the parallel list of names is only used to label the printed report.
dataframes= [train, test, transactions, oil, holidays, stores]
dataframe_names = ["train", "test", "transactions", "oil", "holidays", "stores"]

for name, df in zip(dataframe_names, dataframes):
    print(f"=== {name} ===")
    # DataFrame.info() writes its report straight to stdout and returns None,
    # so it is called on its own rather than inside print() (which would add
    # a stray "None" line to the output).
    df.info()
    # Missing-value count per column.
    print("\n", df.isna().sum(), "\n\n")

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3000888 entries, 0 to 3000887
Data columns (total 6 columns):
 #   Column       Dtype         
---  ------       -----         
 0   id           int64         
 1   date         datetime64[ns]
 2   store_nbr    int64         
 3   family       object        
 4   sales        float64       
 5   onpromotion  int64         
dtypes: datetime64[ns](1), float64(1), int64(3), object(1)
memory usage: 137.4+ MB
None 

 id             0
date           0
store_nbr      0
family         0
sales          0
onpromotion    0
dtype: int64 


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 28512 entries, 0 to 28511
Data columns (total 5 columns):
 #   Column       Non-Null Count  Dtype         
---  ------       --------------  -----         
 0   id           28512 non-null  int64         
 1   date         28512 non-null  datetime64[ns]
 2   store_nbr    28512 non-null  int64         
 3   family       28512 non-null  object        
 4   onpromot