## Import Libraries

In [None]:
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
import seaborn as sns
from scipy.stats import spearmanr
from sklearn.preprocessing import LabelEncoder
%matplotlib inline

## Get the data

In [None]:
# Load train.csv into the notebook-wide DataFrame `df` used by every cell below.
# The path is relative to the notebook's working directory.
df = pd.read_csv('../../../datasets/parte2/train.csv')

#### Columns

In [None]:
# List the column labels of the loaded DataFrame.
df.columns

#### Shape

In [None]:
# Show how often each value of the target column occurs, if the column exists.
# An expression nested inside an `if` is not echoed by the notebook (only a
# cell's final top-level expression is), so the counts are printed explicitly.
target_column = 'Injeção na rede (kWh)'
if target_column in df:
    print(df[target_column].value_counts())

In [None]:
# (rows, columns) of the loaded DataFrame.
df.shape

In [None]:
# Preview the first rows (pandas default: 5) to eyeball the raw values.
df.head()

#### Dataset info

In [None]:
# DataFrame.info() writes its report (dtypes, non-null counts, memory usage)
# to stdout itself and returns None, so wrapping it in print() only appended
# a stray "None" line to the output.
df.info()

#### Unique values

In [None]:
# `dt`: list the distinct values, then report how many there are.
dt_values = df["dt"].unique()
print(dt_values)
print(len(dt_values))

# Text columns: the distinct values alone are informative enough.
for column in ("city_name", "weather_description"):
    print(df[column].unique())

# `rain_1h`: distinct values and their count, as for `dt`.
rain_values = df["rain_1h"].unique()
print(rain_values)
print(len(rain_values))

#### Statistical dispersion

In [None]:
# Summary statistics (count, mean, std, min, quartiles, max) of the numeric columns.
print(df.describe())


#### Missing values

In [None]:
# Number of missing (NaN) entries per column.
print(df.isna().sum())

#### Filling missing values with 'None' in the 'Injeção na rede (kWh)' column

In [None]:
# Replace missing entries in the target column with the string "None".
# NOTE(review): presumably the CSV stores the literal category "None", which
# read_csv parses as NaN by default — confirm against the raw file.
df['Injeção na rede (kWh)'] = df['Injeção na rede (kWh)'].fillna("None")

In [None]:
# Distinct values of the target column after the fill above.
print(df["Injeção na rede (kWh)"].unique())

In [None]:
# Re-count missing entries per column to verify the fill.
print(df.isna().sum())

## Univariate Analysis

#### dt variable

In [None]:
# Count how many rows share each `dt` value.
df['dt'].value_counts()

## Multivariate Analysis

#### Relation between all variables

In [None]:
#sns.pairplot(df, hue='injection ')

#### Features mean by class

In [None]:
#df.groupby(by=['Injeção na rede (kWh)']).mean(numeric_only=True)

#### Outliers

In [None]:
# Draw one boxplot per column on a shared 4x4 grid to spot outliers.
# Columns are listed in the order they fill the grid (row by row).
boxplot_columns = [
    "pressure", "humidity", "clouds_all", "hour",
    "temp", "feels_like", "temp_min", "temp_max",
    "wind_speed", "rain_1h", "normal", "economic_schedule",
    "self-consumption",
]

fig, axs = plt.subplots(4, 4, figsize=(15, 15))
fig.suptitle("Boxplots")

grid_axes = axs.ravel()  # row-major, same order as axs[0,0], axs[0,1], ...
for ax, column in zip(grid_axes, boxplot_columns):
    sns.boxplot(y=df[column], ax=ax)

# The grid has more slots than columns; hide the leftover empty frames.
for ax in grid_axes[len(boxplot_columns):]:
    ax.set_visible(False)

#### Distributions (histograms)

In [None]:
# Draw one histogram (with a KDE overlay) per column on a shared 4x4 grid.
# Columns are listed in the order they fill the grid (row by row).
histogram_columns = [
    "dt", "pressure", "humidity", "clouds_all",
    "hour", "temp", "feels_like", "temp_min",
    "temp_max", "wind_speed", "rain_1h", "normal",
    "economic_schedule", "self-consumption",
]

fig, axs = plt.subplots(4, 4, figsize=(12, 15))
fig.suptitle("Histograms")

grid_axes = axs.ravel()  # row-major, same order as axs[0,0], axs[0,1], ...
for ax, column in zip(grid_axes, histogram_columns):
    sns.histplot(df[column], ax=ax, kde=True)

# The grid has more slots than columns; hide the leftover empty frames.
for ax in grid_axes[len(histogram_columns):]:
    ax.set_visible(False)

In [None]:
# Bar chart of how many rows fall into each value of the target column.
injection_count = df['Injeção na rede (kWh)'].value_counts()
sns.set_style('darkgrid')
sns.barplot(x=injection_count.index, y=injection_count.values, palette='BrBG')
plt.title('Frequency Distribution of Injection')
plt.ylabel('Number of Occurrences', fontsize=12)
# The x axis carries the target column's values; the previous 'RainToday'
# label was left over from another notebook.
plt.xlabel('Injeção na rede (kWh)', fontsize=12)
plt.show()

In [None]:
# Turn the target column into integer class ids so it can take part in the
# numeric correlation analysis.
# NOTE(review): LabelEncoder numbers classes by sorted label order, which need
# not match any natural ordering of the categories — confirm this is intended.
label_encoder = LabelEncoder()
target_col = 'Injeção na rede (kWh)'
df[target_col] = label_encoder.fit_transform(df[target_col])

# Split the calendar date into numeric year / month / day features, then
# discard the original 'Data' column.
parsed_dates = pd.to_datetime(df['Data'], format="%Y-%m-%d", utc=True)
df['year'] = parsed_dates.dt.year
df['month'] = parsed_dates.dt.month
df['day'] = parsed_dates.dt.day
df.drop(columns=['Data'], inplace=True)

In [None]:
# Pairwise correlations between the numeric columns.
corr_matrix = df.corr(numeric_only=True)

# Render the matrix as an annotated heatmap on a fixed [-1, 1] colour scale,
# drawing onto the axes created here.
f, ax = plt.subplots(figsize=(20, 16))
sns.heatmap(corr_matrix, ax=ax, annot=True, square=True, vmin=-1, vmax=1)

In [None]:
# Correlation of every numeric column with the label-encoded target column.
print(corr_matrix["Injeção na rede (kWh)"])