In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
# Render and save figures at 600 DPI.
plt.rcParams.update({'figure.dpi': 600, 'savefig.dpi': 600})

# Seaborn theme; `rc` re-applies the DPI values and sets the default figure size.
sns.set_theme(
    style='darkgrid',
    palette='viridis',
    rc={"figure.dpi": 600, 'savefig.dpi': 600, 'figure.figsize': (11.7, 8.27)},
)

In [3]:
# Load the merged CSV into one DataFrame, then print its column dtypes and memory footprint.
df = pd.read_csv('../data/merged_output.csv')
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 28138584 entries, 0 to 28138583
Data columns (total 13 columns):
 #   Column       Dtype  
---  ------       -----  
 0   time         float64
 1   level        float64
 2   y            float64
 3   x            float64
 4   air          float64
 5   shum         float64
 6   omega        float64
 7   hgt          float64
 8   uwnd         float64
 9   vwnd         float64
 10  source_file  object 
 11  id1          float64
 12  id2          float64
dtypes: float64(12), object(1)
memory usage: 2.7+ GB


In [4]:
# Preview the first rows of the freshly loaded frame.
df.head()

Unnamed: 0,time,level,y,x,air,shum,omega,hgt,uwnd,vwnd,source_file,id1,id2
0,1849800.0,1000.0,2791818.0,5161615.0,288.65955,0.004022,-0.071466,107.12987,-1.188751,0.414642,all_10.0_20110110.csv,,
1,1849800.0,1000.0,2791818.0,5194078.0,288.20642,0.004172,-0.118341,106.12987,-1.720001,0.539642,all_10.0_20110110.csv,,
2,1849800.0,1000.0,2791818.0,5226541.0,287.7533,0.004512,-0.13006,105.62987,-1.126251,0.180267,all_10.0_20110110.csv,,
3,1849800.0,1000.0,2791818.0,5259004.0,286.97205,0.004742,-0.110529,104.62987,-1.141876,0.430267,all_10.0_20110110.csv,,
4,1849800.0,1000.0,2791818.0,5291467.0,287.05017,0.005192,-0.087091,103.12987,-0.032501,2.477142,all_10.0_20110110.csv,,


In [5]:
# Count the missing entries in every column
df.isna().sum()

time                  0
level                 0
y                     0
x                     0
air                   0
shum                  0
omega                 0
hgt                   0
uwnd                  0
vwnd                  0
source_file           0
id1            28138584
id2            28138584
dtype: int64

In [6]:
# Remove the `id1` / `id2` columns.
# Reassigning (instead of `inplace=True`) together with errors='ignore' keeps
# this cell safe to re-run after the columns are already gone.
df = df.drop(columns=['id1', 'id2'], errors='ignore')

In [7]:
# Count rows that are exact duplicates (across all columns) of an earlier row
df.duplicated().sum()

0

In [8]:
# List the columns that hold exactly one distinct value
df.columns[df.nunique().eq(1)]

Index([], dtype='object')

In [9]:
# Extract both numbers from the filename
# (e.g. 'all_10.0_20110110.csv' -> fire_id_1 = 10.0, fire_id_2 = 20110110).
# The guard makes the cell a no-op when it is re-run after `source_file`
# has already been dropped, instead of raising.
if 'source_file' in df.columns:
    # expand=False returns a Series (one capture group), which is what a
    # single-column assignment expects.
    df['fire_id_1'] = (
        df['source_file'].str.extract(r'all_(\d+\.\d+)_', expand=False).astype(float)
    )
    df['fire_id_2'] = (
        df['source_file'].str.extract(r'_(\d+)\.csv$', expand=False).astype(int)
    )

    # The raw filename is no longer needed once both ids are parsed out.
    df = df.drop(columns='source_file')

df.head()

Unnamed: 0,time,level,y,x,air,shum,omega,hgt,uwnd,vwnd,fire_id_1,fire_id_2
0,1849800.0,1000.0,2791818.0,5161615.0,288.65955,0.004022,-0.071466,107.12987,-1.188751,0.414642,10.0,20110110
1,1849800.0,1000.0,2791818.0,5194078.0,288.20642,0.004172,-0.118341,106.12987,-1.720001,0.539642,10.0,20110110
2,1849800.0,1000.0,2791818.0,5226541.0,287.7533,0.004512,-0.13006,105.62987,-1.126251,0.180267,10.0,20110110
3,1849800.0,1000.0,2791818.0,5259004.0,286.97205,0.004742,-0.110529,104.62987,-1.141876,0.430267,10.0,20110110
4,1849800.0,1000.0,2791818.0,5291467.0,287.05017,0.005192,-0.087091,103.12987,-0.032501,2.477142,10.0,20110110


In [10]:
# Number of distinct values in each of fire_id_1 and fire_id_2
df[['fire_id_1', 'fire_id_2']].nunique()

fire_id_1    50681
fire_id_2     6308
dtype: int64

In [12]:
# Converting fire id 2 from YYYYMMDD format to datetime
# Example: 20110110 -> 2011-01-10
# Skipped when the column is already datetime, so re-running the cell does not
# try to parse '2011-01-10'-style strings with the '%Y%m%d' format and fail.
if not pd.api.types.is_datetime64_any_dtype(df['fire_id_2']):
    df['fire_id_2'] = pd.to_datetime(df['fire_id_2'].astype(str), format='%Y%m%d')
df.head()

Unnamed: 0,time,level,y,x,air,shum,omega,hgt,uwnd,vwnd,fire_id_1,fire_id_2
0,1849800.0,1000.0,2791818.0,5161615.0,288.65955,0.004022,-0.071466,107.12987,-1.188751,0.414642,10.0,2011-01-10
1,1849800.0,1000.0,2791818.0,5194078.0,288.20642,0.004172,-0.118341,106.12987,-1.720001,0.539642,10.0,2011-01-10
2,1849800.0,1000.0,2791818.0,5226541.0,287.7533,0.004512,-0.13006,105.62987,-1.126251,0.180267,10.0,2011-01-10
3,1849800.0,1000.0,2791818.0,5259004.0,286.97205,0.004742,-0.110529,104.62987,-1.141876,0.430267,10.0,2011-01-10
4,1849800.0,1000.0,2791818.0,5291467.0,287.05017,0.005192,-0.087091,103.12987,-0.032501,2.477142,10.0,2011-01-10


In [13]:
# Re-check column dtypes and memory usage after the cleaning steps above.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 28138584 entries, 0 to 28138583
Data columns (total 12 columns):
 #   Column     Dtype         
---  ------     -----         
 0   time       float64       
 1   level      float64       
 2   y          float64       
 3   x          float64       
 4   air        float64       
 5   shum       float64       
 6   omega      float64       
 7   hgt        float64       
 8   uwnd       float64       
 9   vwnd       float64       
 10  fire_id_1  float64       
 11  fire_id_2  datetime64[ns]
dtypes: datetime64[ns](1), float64(11)
memory usage: 2.5 GB


In [16]:
import pandas as pd
import numpy as np
from sklearn.ensemble import IsolationForest
from sklearn.preprocessing import StandardScaler
import warnings
# NOTE(review): this installs a blanket "ignore" filter for every warning
# category, so pandas / scikit-learn warnings raised after this point are
# hidden too. Consider narrowing it to the specific category that is noisy.
warnings.filterwarnings('ignore')


def detect_outliers(df, contamination=0.1, random_state=42, n_samples=100000,
                    chunk_size=1_000_000):
    """
    Detect outliers in the dataset using Isolation Forest

    A StandardScaler and an IsolationForest are fitted on a random sample of
    at most ``n_samples`` rows; every row of ``df`` is then scored in batches
    of ``chunk_size`` rows so the scaled feature matrix never has to exist for
    the whole frame at once.

    Parameters:
    -----------
    df : pandas DataFrame
        Input dataset. Its float64 columns are used as features. The frame is
        modified in place: the columns ``outlier_score`` and ``is_outlier``
        are added (or overwritten).
    contamination : float, default=0.1
        The proportion of outliers in the data set
    random_state : int, default=42
        Random state for reproducibility (row sampling and the forest)
    n_samples : int, default=100000
        Number of samples to use for training (for large datasets)
    chunk_size : int, default=1000000
        Number of rows scored per batch

    Returns:
    --------
    tuple of (df, iso_forest, scaler)
        ``df`` is the input frame with ``outlier_score`` (the forest's
        ``score_samples`` value) and ``is_outlier`` (bool) added, followed by
        the fitted IsolationForest and the fitted StandardScaler.

    Raises:
    -------
    ValueError
        If ``chunk_size`` is smaller than 1 or ``df`` has no float64 feature
        columns.
    """
    if chunk_size < 1:
        raise ValueError(f"chunk_size must be a positive integer, got {chunk_size!r}")

    # Features are all float64 columns. A score column left behind by an
    # earlier call on the same frame is an output, not a feature, so skip it.
    num_cols = [
        col for col in df.select_dtypes(include=['float64']).columns
        if col != 'outlier_score'
    ]
    if not num_cols:
        raise ValueError("df has no float64 columns to use as features")

    # Fit on a sample when the frame is large. The sample is only read from,
    # so the small-frame branch needs no defensive copy.
    if len(df) > n_samples:
        df_sample = df.sample(n=n_samples, random_state=random_state)
    else:
        df_sample = df

    scaler = StandardScaler()
    X_scaled = scaler.fit_transform(df_sample[num_cols])

    iso_forest = IsolationForest(
        contamination=contamination,
        random_state=random_state,
        n_jobs=-1
    )
    iso_forest.fit(X_scaled)

    # Score every row batch by batch into a preallocated array (no Python list
    # of boxed floats). Slicing rows first and selecting columns second keeps
    # each copy limited to the current chunk.
    n_rows = len(df)
    scores = np.empty(n_rows, dtype=float)
    for start in range(0, n_rows, chunk_size):
        stop = min(start + chunk_size, n_rows)
        chunk_scaled = scaler.transform(df.iloc[start:stop][num_cols])
        scores[start:stop] = iso_forest.score_samples(chunk_scaled)

    df['outlier_score'] = scores
    # IsolationForest.predict labels a sample -1 exactly when
    # decision_function(X) < 0, and decision_function(X) is
    # score_samples(X) - offset_. Reusing the scores avoids a second,
    # un-chunked pass over the whole frame.
    df['is_outlier'] = (scores - iso_forest.offset_) < 0

    return df, iso_forest, scaler


def main(df, contamination=0.1, n_samples=100000, random_state=42):
    """
    Clean a frame, flag its outliers with detect_outliers and print a summary.

    Parameters:
    -----------
    df : pandas DataFrame
        Input dataset. It is not modified; all work happens on a new frame.
    contamination : float, default=0.1
        Passed through to detect_outliers
    n_samples : int, default=100000
        Passed through to detect_outliers
    random_state : int, default=42
        Passed through to detect_outliers

    Returns:
    --------
    tuple of (df_with_outliers, model, scaler, outlier_stats)
        The cleaned frame with ``outlier_score`` / ``is_outlier`` columns, the
        fitted model and scaler returned by detect_outliers, and the result of
        ``describe()`` grouped by ``is_outlier``.
    """
    # replace() already returns a new frame, so the caller's df stays untouched
    # without paying for an extra full copy up front.
    df = df.replace([np.inf, -np.inf], np.nan)
    # Fill remaining gaps with each column's mean.
    df = df.fillna(df.mean())

    df_with_outliers, model, scaler = detect_outliers(
        df,
        contamination=contamination,
        random_state=random_state,
        n_samples=n_samples
    )

    n_total = len(df_with_outliers)
    n_outliers = df_with_outliers['is_outlier'].sum()
    # Computed outside the f-string: a replacement field that spans two lines
    # inside a single-quoted f-string is a SyntaxError before Python 3.12.
    pct_outliers = n_outliers / n_total * 100

    print("\nOutlier Detection Summary:")
    print(f"Total samples: {n_total}")
    print(f"Number of outliers: {n_outliers}")
    print(f"Percentage of outliers: {pct_outliers:.2f}%")

    outlier_stats = df_with_outliers.groupby('is_outlier').describe()

    return df_with_outliers, model, scaler, outlier_stats

In [18]:
# Run the outlier pipeline on the cleaned frame and keep the labelled frame,
# the fitted model and scaler, and the per-group summary statistics.
df_with_outliers, model, scaler, outlier_stats = main(df)


Outlier Detection Summary:
Total samples: 28138584
Number of outliers: 2838938
Percentage of outliers: 10.09%


In [19]:
# Preview the first rows, now including `outlier_score` and `is_outlier`.
df_with_outliers.head()

Unnamed: 0,time,level,y,x,air,shum,omega,hgt,uwnd,vwnd,fire_id_1,fire_id_2,outlier_score,is_outlier
0,1849800.0,1000.0,2791818.0,5161615.0,288.65955,0.004022,-0.071466,107.12987,-1.188751,0.414642,10.0,2011-01-10,-0.521593,True
1,1849800.0,1000.0,2791818.0,5194078.0,288.20642,0.004172,-0.118341,106.12987,-1.720001,0.539642,10.0,2011-01-10,-0.52358,True
2,1849800.0,1000.0,2791818.0,5226541.0,287.7533,0.004512,-0.13006,105.62987,-1.126251,0.180267,10.0,2011-01-10,-0.520819,True
3,1849800.0,1000.0,2791818.0,5259004.0,286.97205,0.004742,-0.110529,104.62987,-1.141876,0.430267,10.0,2011-01-10,-0.526151,True
4,1849800.0,1000.0,2791818.0,5291467.0,287.05017,0.005192,-0.087091,103.12987,-0.032501,2.477142,10.0,2011-01-10,-0.528893,True


In [20]:
# How many rows were flagged as outliers versus not flagged.
df_with_outliers['is_outlier'].value_counts()

is_outlier
False    25299646
True      2838938
Name: count, dtype: int64

In [21]:
# Write the full labelled frame (all columns, no index) to CSV.
df_with_outliers.to_csv('../data/merged_output_outliers.csv', index=False)

In [22]:
# Persist just the boolean flag column (header plus one value per row, no index)
df_with_outliers['is_outlier'].to_frame().to_csv(
    '../data/merged_output_outliers_column.csv', index=False)

In [23]:
# Descriptive statistics of every column, grouped by the `is_outlier` flag.
outlier_stats

Unnamed: 0_level_0,time,time,time,time,time,time,time,time,level,level,...,fire_id_2,fire_id_2,outlier_score,outlier_score,outlier_score,outlier_score,outlier_score,outlier_score,outlier_score,outlier_score
Unnamed: 0_level_1,count,mean,min,25%,50%,75%,max,std,count,mean,...,max,std,count,mean,min,25%,50%,75%,max,std
is_outlier,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2,Unnamed: 16_level_2,Unnamed: 17_level_2,Unnamed: 18_level_2,Unnamed: 19_level_2,Unnamed: 20_level_2,Unnamed: 21_level_2
False,25299646.0,1865279.0,1771632.0,1812936.0,1863600.0,1915632.0,1962984.0,56243.174276,25299646.0,567.290857,...,2023-12-09 00:00:00,,25299646.0,-0.460333,-0.515631,-0.481011,-0.459029,-0.439231,-0.3927,0.026902
True,2838938.0,1871949.0,1771632.0,1810416.0,1873512.0,1933944.0,1962984.0,62962.112859,2838938.0,532.622322,...,2023-12-09 00:00:00,,2838938.0,-0.535828,-0.639845,-0.544622,-0.53113,-0.52234,-0.515631,0.017406
