# Initial work  

## Packages/library import 

In [198]:
import numpy as np
import pandas as pd
import plotly.express as px
from sklearn.cluster import KMeans
from sklearn.preprocessing import StandardScaler
from sklearn.neighbors import LocalOutlierFactor

## Data Import

In [243]:
# Path of the CSV file that holds the CO2 consumption readings.
s_path_data = "dummy_co2_data.csv"

# Load the file using its first column as the index, then move that index back
# into a regular column so the frame ends up with a default integer index.
data = pd.read_csv(s_path_data, index_col=0).reset_index()

# Parse the Date column into pandas datetimes.
data["Date"] = pd.to_datetime(data["Date"])

data.head()

Unnamed: 0,Date,CO2_Consumption
0,2023-01-01,3864.901425
1,2023-01-02,3995.852071
2,2023-01-03,4019.430656
3,2023-01-04,4045.690896
4,2023-01-05,3992.975399


In [200]:
# Summary statistics of the consumption column, shown as horizontal reference lines.
co2 = data["CO2_Consumption"]
mean = co2.mean()
median = co2.median()

# Both text labels share one anchor point: latest date, highest consumption value.
x_coord = data["Date"].max()
y_coord = co2.max()

fig = px.scatter(data, x="Date", y="CO2_Consumption", title="CO2 consumption over time")

# One entry per reference line: (name, level, dash style, line colour, label y-anchor).
reference_lines = (
    ("Mean", mean, "dash", "rgba(0, 128, 0, 0.5)", "top"),
    ("Median", median, "dot", "rgba(255, 0, 0, 0.5)", "bottom"),
)
for stat_name, level, dash, colour, anchor in reference_lines:
    fig.add_hline(y=level, line_dash=dash, line_color=colour, name=stat_name)
    fig.add_annotation(
        x=x_coord,
        y=y_coord,
        text=f"{stat_name}: {level:.2f}",
        showarrow=False,
        xanchor="right",
        yanchor=anchor,
    )

fig.update_layout(showlegend=True)
fig.show()

#### The data show a periodic pattern: it may correspond to weekends, when the company is probably less active. There are three extreme values: they are either outliers to exclude (measurement errors, ...) or the very abnormal usages that we are supposed to detect.

In [201]:
# Box plot of the consumption values, with every observation drawn next to the box.
boxplot = px.box(
    data,
    y="CO2_Consumption",
    points="all",
    title="Box plot of CO2 consumption",
)
fig = boxplot

# Last expression of the cell, so its value is what the notebook renders.
fig.update_layout(width=400, height=500)

In [202]:
# Number of missing values per column. Printed explicitly: only the last
# expression of a cell is rendered, so a bare expression here would be
# evaluated and silently discarded.
print(data.isna().sum())

# Summary statistics computed on the rows that have no missing value.
data.dropna().describe()

Unnamed: 0,Date,CO2_Consumption
count,117,117.0
mean,2023-03-01 00:00:00,3945.863055
min,2023-01-01 00:00:00,2798.252465
25%,2023-01-30 00:00:00,3874.676347
50%,2023-03-01 00:00:00,3986.097469
75%,2023-03-30 00:00:00,4007.726512
max,2023-04-30 00:00:00,4941.209896
std,,187.134568


## Outliers

In [244]:
# 1.5 x IQR rule: values further than 1.5 interquartile ranges below the first
# quartile or above the third quartile are flagged as outliers.
q1 = data.CO2_Consumption.quantile(0.25)
q3 = data.CO2_Consumption.quantile(0.75)
IQR = q3 - q1
lower_bound = q1 - 1.5 * IQR
upper_bound = q3 + 1.5 * IQR

# Rows lying outside the [lower_bound, upper_bound] interval.
is_too_low = data.CO2_Consumption < lower_bound
is_too_high = data.CO2_Consumption > upper_bound
outliers = data[is_too_low | is_too_high]
outliers

Unnamed: 0,Date,CO2_Consumption
14,2023-01-15,2798.252465
37,2023-02-07,4941.209896
55,2023-02-25,2877.938404


In [245]:
# Flag every row whose consumption value is one of the outlier values found above,
# keep the remaining rows and drop any row that still has a missing value.
is_outlier = data["CO2_Consumption"].isin(outliers["CO2_Consumption"])
data = data.loc[~is_outlier].dropna()
data.describe()

Unnamed: 0,Date,CO2_Consumption
count,114,114.0
mean,2023-03-01 14:56:50.526315776,3956.566462
min,2023-01-01 00:00:00,3792.601593
25%,2023-01-30 06:00:00,3880.365683
50%,2023-03-02 12:00:00,3986.139153
75%,2023-03-30 18:00:00,4007.609601
max,2023-04-30 00:00:00,4073.897263
std,,72.928355


### Plot without outliers

In [205]:
# Scatter of the consumption values that remain in `data` at this point.
title_without_outliers = "CO2 consumption over time (without outliers)"
fig = px.scatter(data, x="Date", y="CO2_Consumption", title=title_without_outliers)
fig.show()

## Implementation of K-means algorithm

### Capture trends in the time-series

In [206]:
# Single feature (the consumption value), shaped as one column with one row per day.
X = data["CO2_Consumption"].to_numpy().reshape(-1, 1)

# Two clusters with a fixed seed.
kmeans = KMeans(n_clusters=2, random_state=0, n_init="auto")
kmeans.fit(X)

# Cluster id assigned to each row of `data`.
label = kmeans.labels_

# Displayed as the cell output: the fitted cluster centers.
kmeans.cluster_centers_

array([[3999.59651722],
       [3850.94723505]])

In [207]:
# Draw one trace per cluster that KMeans actually produced. Hard-coding a fixed
# number of traces would add empty "Cluster 2"/"Cluster 3" legend entries when
# the model is fitted with fewer clusters.
colors = px.colors.qualitative.Dark24

# Base figure: provides the title and axis labels. Its single trace is drawn
# underneath the per-cluster traces added below.
fig = px.scatter(
    data,
    x="Date",
    y="CO2_Consumption",
    title="CO2 consumption over time (clustered)",
)

for cluster_id in np.unique(label):
    cluster_points = data[label == cluster_id]
    fig.add_scatter(
        x=cluster_points["Date"],
        y=cluster_points["CO2_Consumption"],
        mode="markers",
        # Wrap around the palette so any number of clusters gets a colour.
        marker=dict(color=colors[int(cluster_id) % len(colors)]),
        name=f"Cluster {cluster_id}",
    )

fig.update_layout(legend_title_text="Clusters", showlegend=True)
fig.show()

### Using the clusters to implement outlier detection

In [208]:
# KMeans numbers its clusters arbitrarily, so identify the low-consumption
# cluster from the fitted centers instead of assuming it has a fixed label id.
low_label = int(np.argmin(kmeans.cluster_centers_[:, 0]))

# Days in the cluster with the smallest center vs. all remaining days.
low_days = data[label == low_label]
high_days = data[label != low_label]

In [225]:
# Consumption of the low-consumption days, shaped as a single feature column.
X = low_days["CO2_Consumption"].to_numpy().reshape(-1, 1)

# Standardize the feature with a scaler fitted on these same values.
scaler = StandardScaler()
scaler.fit(X)
X_scaled = scaler.transform(X)

# Local Outlier Factor model using 15 neighbours per point.
lof_outlier = LocalOutlierFactor(n_neighbors=15)

# Fit on the scaled values and get one label per row; rows labelled -1 are the
# ones treated as outliers below.
outlier_scores = lof_outlier.fit_predict(X_scaled)

# Boolean mask that is True where the row was labelled -1.
outlier_indices = outlier_scores == -1
print("Outlier indices:", outlier_indices)

Outlier indices: [False  True False  True  True False False False False False False False
 False  True False False False False False  True False False False False
 False False False False False False False False False]


In [226]:
# Rows of the low-consumption days selected by the outlier mask.
low_outliers = low_days[outlier_indices]

# All low-consumption days, with the selected rows drawn on top in red.
fig = px.scatter(
    low_days,
    x="Date",
    y="CO2_Consumption",
    title="CO2 consumption over time for low-consumption days (outliers detected)",
)
fig.add_scatter(
    x=low_outliers["Date"],
    y=low_outliers["CO2_Consumption"],
    mode="markers",
    marker=dict(color="red"),
    name="Outliers",
)
fig.show()

In [223]:
# Consumption of the high-consumption days, shaped as a single feature column.
X = high_days["CO2_Consumption"].to_numpy().reshape(-1, 1)

# Standardize the feature with a scaler fitted on these same values.
scaler = StandardScaler()
scaler.fit(X)
X_scaled = scaler.transform(X)

# Local Outlier Factor model using 50 neighbours per point and a contamination
# setting of 0.1.
lof_outlier = LocalOutlierFactor(n_neighbors=50, contamination=0.1)

# Fit on the scaled values and get one label per row; rows labelled -1 are the
# ones treated as outliers below.
outlier_scores = lof_outlier.fit_predict(X_scaled)

# Boolean mask that is True where the row was labelled -1.
outlier_indices = outlier_scores == -1
print("Outlier indices:", outlier_indices)

Outlier indices: [False False  True False False False False False False False False False
 False False False False False False False False False False  True False
 False False False False False False False False False False False False
 False False False False False False False False False False False False
  True False  True False False  True False False False False False False
 False False False False False False False False False False False  True
 False False False  True  True False False False False]


In [224]:
# Rows of the high-consumption days selected by the outlier mask.
high_outliers = high_days[outlier_indices]

# All high-consumption days, with the selected rows drawn on top in red.
fig = px.scatter(
    high_days,
    x="Date",
    y="CO2_Consumption",
    title="CO2 consumption over time for normal-consumption days (outliers detected)",
)
fig.add_scatter(
    x=high_outliers["Date"],
    y=high_outliers["CO2_Consumption"],
    mode="markers",
    marker=dict(color="red"),
    name="Outliers",
)
fig.show()

## With a Novelty algorithm 

### Creation of sample data to test the algorithm

In [246]:
# Build a synthetic batch of "new" observations: 30 % of the existing rows,
# re-dated to the consecutive days that follow the last row's date.
# random_state pins the draw so that a re-run reproduces the same sample.
sample = data.sample(frac=0.3, random_state=0)
start = data["Date"].iloc[-1] + pd.Timedelta(days=1)
new_dates = pd.date_range(start=start, periods=len(sample), freq="D")
sample = sample.assign(Date=new_dates).set_index("Date")

# Combine history and new values on a Date index. `data` itself is not
# modified, so this cell can be re-run and earlier cells that read
# data["Date"] keep working.
df = pd.concat([data.set_index("Date"), sample], axis=0).sort_index()

In [247]:
# Move the Date index back into a column so it can be used as the x axis.
df = df.reset_index()

# Full series (history followed by the new values).
fig = px.scatter(
    df,
    x="Date",
    y="CO2_Consumption",
    title="CO2 consumption over time (with new values)",
)

# Highlight the rows of `sample` in red.
fig.add_scatter(
    x=sample.index,
    y=sample["CO2_Consumption"],
    mode="markers",
    marker=dict(color="red"),
    name="New values",
)
fig.show()

In [248]:
# Consumption values, taken before the frame is re-indexed by date.
X = df["CO2_Consumption"]

# Index the frame by date and make sure both sides of the comparison are datetimes.
df = df.set_index("Date")
df.index = pd.to_datetime(df.index)
start = pd.to_datetime(start)

# Rows dated before `start` train the model; rows from `start` onwards are scored.
is_history = df.index < start
is_new = df.index >= start
X_train = X[is_history].to_numpy().reshape(-1, 1)
X_test = X[is_new].to_numpy().reshape(-1, 1)

# Local Outlier Factor in novelty mode: fitted on the history only, then used
# to predict on the unseen rows.
lof_novelty = LocalOutlierFactor(n_neighbors=40, novelty=True, contamination=0.1)
lof_novelty.fit(X_train)

prediction = lof_novelty.predict(X_test)
print("Novelty detection for new data point:", prediction)

Novelty detection for new data point: [ 1  1  1  1  1 -1  1  1  1  1  1  1  1  1  1  1 -1  1  1  1  1 -1  1  1
  1  1  1  1  1  1  1  1  1  1]


In [249]:
# Move the Date index back into a column so it can be used as the x axis.
df = df.reset_index()

# Rows of `sample` whose prediction is -1.
is_flagged = prediction == -1
flagged = sample[is_flagged]

fig = px.scatter(
    df,
    x="Date",
    y="CO2_Consumption",
    title="CO2 consumption over time (with new values)",
)
fig.add_scatter(
    x=flagged.index,
    y=flagged["CO2_Consumption"],
    mode="markers",
    marker=dict(color="red"),
    name="Outliers",
)

# Vertical marker at `start`, the first date of the new values.
fig.add_vline(x=start, line_dash="dash", line_color="rgba(0, 0, 0, 0.5)")
fig.show()

This prediction shows that the model is able to identify irregular data points in a test set.

I could improve the prediction by fitting a separate model on the low-consumption days and on the normal-consumption days. Of course, the model would also be more effective with more data!