In [None]:
pip install liac-arff

In [None]:
# Standard library, ARFF writer and the scientific Python stack.
import os
import arff  # module provided by the `liac-arff` package installed in the first cell
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
# Jupyter magic: render matplotlib figures inline, below the cell that creates them.
%matplotlib inline
plt.style.use('ggplot')

from datetime import datetime, timedelta
from scipy import stats
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split

# Show every column when displaying wide DataFrames and widen the text output.
pd.set_option('display.max_columns', None)
pd.set_option('display.width', 1000)

# Seed NumPy's global random number generator.
np.random.seed(42)

# NOTE(review): this silences every warning for the rest of the session, deprecation notices included.
import warnings
warnings.filterwarnings("ignore")

# 1. Data Loading and Inspection

In [None]:
# All three input files sit in the same directory; keep that location in one
# constant so it only has to be changed in a single place.
DATA_DIR = "/kaggle/input/production-quality"

X = pd.read_csv(f"{DATA_DIR}/data_X.csv")
Y = pd.read_csv(f"{DATA_DIR}/data_Y.csv")
submission = pd.read_csv(f"{DATA_DIR}/sample_submission.csv")

In [None]:
# Report the (rows, columns) size of each raw table.
for label, frame in (("Data_X", X), ("Data_Y", Y), ("Sample Submission", submission)):
    print(f"{label} shape:", frame.shape)

In [None]:
# List the column index of each raw table.
for label, frame in (("Data_X", X), ("Data_Y", Y), ("Sample Submission", submission)):
    print(f"{label} columns:\n", frame.columns)

In [None]:
# Dtypes, non-null counts and memory footprint of the raw data_X table.
X.info()

In [None]:
# Dtypes, non-null counts and memory footprint of the raw data_Y table.
Y.info()

In [None]:
# Dtypes, non-null counts and memory footprint of the sample submission table.
submission.info()

In [None]:
# Preview the first rows of data_X.
X.head()

In [None]:
# Preview the first rows of data_Y.
Y.head()

In [None]:
# Preview the first rows of the sample submission.
submission.head()

In [None]:
# Number of missing values in each column of data_X, shown as a one-column table.
missing_counts = X.isna().sum().rename('missing_counts').to_frame()
print(missing_counts)

In [None]:
# Number of missing values in each column of data_Y, shown as a one-column table.
missing_counts = Y.isna().sum().rename('missing_counts').to_frame()
print(missing_counts)

In [None]:
# Number of missing values in each column of the sample submission, shown as a one-column table.
missing_counts = submission.isna().sum().rename('missing_counts').to_frame()
print(missing_counts)

# 2. Data Cleaning

In [None]:
# Clean on independent copies so the raw tables loaded earlier stay untouched.
X_clean, Y_clean, submission_clean = (raw.copy() for raw in (X, Y, submission))

In [None]:
# Names of the int64/float64 columns of X_clean; the plots and the outlier capping below iterate over them.
numerical_cols = X_clean.select_dtypes(include=['int64', 'float64']).columns

In [None]:
# Diagnostic grid: one row per numeric column holding a histogram with KDE,
# a boxplot and a probability (Q-Q) plot.
n_rows = len(numerical_cols)
plt.figure(figsize=(15, n_rows * 4))

for row, col in enumerate(numerical_cols):
    first_panel = row * 3 + 1  # subplot indices are 1-based and run row by row

    plt.subplot(n_rows, 3, first_panel)
    sns.histplot(X_clean[col], kde=True, color="dodgerblue")
    plt.title(f'Distribution of {col}')

    plt.subplot(n_rows, 3, first_panel + 1)
    sns.boxplot(x=X_clean[col], color="dodgerblue")
    plt.title(f'Boxplot of {col}')

    plt.subplot(n_rows, 3, first_panel + 2)
    (theoretical_q, ordered_vals), _fit = stats.probplot(X_clean[col].dropna(), plot=plt)
    # Draw the sample points again so they use the same colour as the other panels.
    plt.plot(theoretical_q, ordered_vals, 'o', color="dodgerblue")
    plt.title(f'Q-Q Plot of {col}')

plt.tight_layout()
plt.show()

In [None]:
def cap_outliers(df, column):
    """Clamp the values of ``column`` to the 1.5 * IQR fences and report how many were outside.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding the column. The column is overwritten in place.
    column : str
        Name of the numeric column to cap.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, with ``column`` replaced by the capped values.
    """
    q_low = df[column].quantile(0.25)
    q_high = df[column].quantile(0.75)
    spread = q_high - q_low

    floor = q_low - 1.5 * spread
    ceiling = q_high + 1.5 * spread

    values = df[column]
    outliers = ((values < floor) | (values > ceiling)).sum()
    print(f"Found {outliers} outliers in {column} ({outliers/len(df)*100:.2f}%)")

    # Two passes: raise everything below the floor, then lower everything above the ceiling.
    capped = np.where(values < floor, floor, values)
    capped = np.where(capped > ceiling, ceiling, capped)
    df[column] = capped

    return df

# Cap only columns with more than 10 distinct values; columns at or below that count are left as they are.
for col in numerical_cols:
    if X_clean[col].nunique() <= 10:
        continue
    X_clean = cap_outliers(X_clean, col)

In [None]:
# Same diagnostic grid as before the capping step, redrawn on the capped X_clean:
# histogram with KDE, boxplot and probability (Q-Q) plot for each numeric column.
n_rows = len(numerical_cols)
plt.figure(figsize=(15, n_rows * 4))

for row, col in enumerate(numerical_cols):
    first_panel = row * 3 + 1  # subplot indices are 1-based and run row by row

    plt.subplot(n_rows, 3, first_panel)
    sns.histplot(X_clean[col], kde=True, color="dodgerblue")
    plt.title(f'Distribution of {col}')

    plt.subplot(n_rows, 3, first_panel + 1)
    sns.boxplot(x=X_clean[col], color="dodgerblue")
    plt.title(f'Boxplot of {col}')

    plt.subplot(n_rows, 3, first_panel + 2)
    (theoretical_q, ordered_vals), _fit = stats.probplot(X_clean[col].dropna(), plot=plt)
    # Draw the sample points again so they use the same colour as the other panels.
    plt.plot(theoretical_q, ordered_vals, 'o', color="dodgerblue")
    plt.title(f'Q-Q Plot of {col}')

plt.tight_layout()
plt.show()

In [None]:
# Parse the 'date_time' column of every working table into datetime values.
for frame in (X_clean, Y_clean, submission_clean):
    frame['date_time'] = pd.to_datetime(frame['date_time'])

In [None]:
# First and last timestamp covered by each table.
for label, frame in (
    ("X_clean", X_clean),
    ("Y_clean", Y_clean),
    ("submission_clean", submission_clean),
):
    stamps = frame['date_time']
    print(f"{label} date range: {stamps.min()} to {stamps.max()}")

In [None]:
# Bucket every reading into the hour it falls in. The lowercase 'h' alias is used
# because the uppercase 'H' spelling is deprecated in pandas 2.2+.
X_clean['hour'] = X_clean['date_time'].dt.floor('h')

# Average each numeric column within its hour bucket.
X_hourly = X_clean.groupby('hour').agg({
    col: 'mean' for col in numerical_cols
}).reset_index()

# Expose the hour bucket under the 'date_time' name, the key used later to join with the targets.
X_hourly = X_hourly.rename(columns={'hour': 'date_time'})

In [None]:
# Hour bucket of every target row. The lowercase 'h' alias is used because the
# uppercase 'H' spelling is deprecated in pandas 2.2+.
Y_clean['hour'] = Y_clean['date_time'].dt.floor('h')

# Keep only the hourly feature rows whose hour also appears in the target table.
common_hours = set(Y_clean['hour'].unique())
X_hourly_aligned = X_hourly[X_hourly['date_time'].isin(common_hours)]

# One target row per hour: drop_duplicates keeps the first row seen for each hour.
Y_hourly = Y_clean.drop_duplicates(subset=['hour']).copy()
# Overwrite the original timestamp with the hour bucket so it matches X_hourly's key.
Y_hourly['date_time'] = Y_hourly['hour']
Y_hourly = Y_hourly.drop('hour', axis=1)

In [None]:
# Compare table sizes before and after restricting the features to hours that have a target.
for label, frame in (
    ("X_hourly shape before alignment", X_hourly),
    ("X_hourly shape after alignment", X_hourly_aligned),
    ("Y_hourly shape", Y_hourly),
):
    print(f"{label}: {frame.shape}")

In [None]:
# Continue with the aligned rows only, renumbered from 0 (the old index is discarded).
X_hourly = X_hourly_aligned.reset_index(drop=True)

In [None]:
# First and last hour covered by the aligned feature and target tables.
for label, frame in (("X_hourly", X_hourly), ("Y_hourly", Y_hourly)):
    stamps = frame['date_time']
    print(f"{label} date range: {stamps.min()} to {stamps.max()}")

# **3. Exploratory Data Analysis (EDA)**

In [None]:
# Recompute the int64/float64 column names on the hourly table; the EDA cells below index X_hourly with them.
numerical_cols = X_hourly.select_dtypes(include=['int64', 'float64']).columns

In [None]:
# Per-feature descriptive statistics (one row per feature) plus two derived spread measures.
summary_stats = (
    X_hourly[numerical_cols]
    .describe()
    .iloc[1:]  # skip the first row of describe() (the count)
    .T
    .assign(
        range=lambda s: s['max'] - s['min'],
        coefficient_of_variation=lambda s: s['std'] / s['mean'] * 100,
    )
)
numerical_summary = summary_stats.style.background_gradient(cmap='coolwarm')
display(numerical_summary)

In [None]:
# Only runs when the target table actually carries a 'quality' column.
if 'quality' in Y_hourly.columns:
    quality_values = Y_hourly['quality']

    plt.figure(figsize=(10, 6))
    sns.histplot(quality_values, kde=True, color='dodgerblue')
    plt.title('Distribution of Target Variable (Quality)')
    plt.show()

    print("Summary statistics of target variable:")
    display(quality_values.describe())

In [None]:
# Pairwise correlation between the hourly numeric features.
correlation_matrix = X_hourly[numerical_cols].corr()

plt.figure(figsize=(12, 10))
# The matrix is symmetric, so hide the upper triangle (diagonal included).
mask = np.triu(np.ones_like(correlation_matrix, dtype=bool))
sns.heatmap(
    correlation_matrix,
    mask=mask,
    annot=False,
    cmap='coolwarm',
    center=0,
    square=True,
    linewidths=.5,
)
plt.title('Correlation Matrix of Numerical Features')
plt.tight_layout()
plt.show()

# For every pair whose |corr| exceeds the threshold, flag the feature that comes
# later in column order (only entries strictly below the diagonal are inspected).
threshold = 0.7
strong_below_diag = np.tril(correlation_matrix.abs().to_numpy() > threshold, k=-1)
high_corr_features = {
    name
    for name, flagged in zip(correlation_matrix.columns, strong_below_diag.any(axis=1))
    if flagged
}

print(f"Highly correlated features (|corr| > {threshold}):")
print(high_corr_features)

# **4. Feature Engineering**

In [None]:
# Add the engineered features to a copy so X_hourly itself is not modified.
X_engineered = X_hourly.copy()

In [None]:
# Pull the hour of day and the day of week out of the timestamp.
date_parts = X_engineered['date_time'].dt
X_engineered['hour'] = date_parts.hour
X_engineered['dayofweek'] = date_parts.dayofweek

In [None]:
# Map the hour onto a 24-step circle so the sine/cosine pair wraps around at midnight.
hour_angle = 2 * np.pi * X_engineered['hour'] / 24
X_engineered['hour_sin'] = np.sin(hour_angle)
X_engineered['hour_cos'] = np.cos(hour_angle)

In [None]:
# Map the day of week onto a 7-step circle so the sine/cosine pair wraps around at the week boundary.
weekday_angle = 2 * np.pi * X_engineered['dayofweek'] / 7
X_engineered['dayofweek_sin'] = np.sin(weekday_angle)
X_engineered['dayofweek_cos'] = np.cos(weekday_angle)

In [None]:
# The raw integer parts are no longer needed once their sine/cosine encodings exist.
X_engineered = X_engineered.drop(columns=["hour", "dayofweek"])

In [None]:
# Size of the feature table after the calendar encodings were added.
print(f"Feature engineered data shape: {X_engineered.shape}")

# **5. Train-Test Split**

In [None]:
# Sizes of the two tables about to be merged. The first label names the frame
# whose shape is actually printed (X_engineered, not X_hourly).
print("X_engineered shape:", X_engineered.shape)
print("Y_hourly shape:", Y_hourly.shape)

In [None]:
# Attach the target to the features; the inner join keeps only hours present in both tables.
quality_by_hour = Y_hourly[["date_time", "quality"]]
training_data = X_engineered.merge(quality_by_hour, on="date_time", how="inner")

In [None]:
# Size of the joined features-plus-target table.
print(f"Training data shape after merging: {training_data.shape}")

In [None]:
# Target vector, and the feature matrix without the target and the join key.
y_data = training_data["quality"]
X_data = training_data.drop(columns=["quality", "date_time"])

In [None]:
# Hold out 20% of the rows for testing; the fixed random_state makes the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X_data,
    y_data,
    test_size=0.2,
    random_state=0,
)

In [None]:
# Feature-matrix and target sizes of each split.
for label, features, target in (("Training", X_train, y_train), ("Testing", X_test, y_test)):
    print(f"{label} data shape: {features.shape}, {target.shape}")

# 6. Standardization

In [None]:
# int64/float64 columns of the training features; these are the columns standardised in the next cell.
numerical_cols = X_train.select_dtypes(include=['int64', 'float64']).columns

In [None]:
# Fit the scaler on the training split only, then apply that same transform to both splits.
scaler = StandardScaler().fit(X_train[numerical_cols])

X_train_scaled = X_train.copy()
X_test_scaled = X_test.copy()
X_train_scaled[numerical_cols] = scaler.transform(X_train[numerical_cols])
X_test_scaled[numerical_cols] = scaler.transform(X_test[numerical_cols])

In [None]:
print("Summary statistics after scaling (training set):")
# One row per feature, without the first row of describe() (the count).
scaled_summary = X_train_scaled[numerical_cols].describe().iloc[1:].T
display(scaled_summary.style.background_gradient(cmap='coolwarm'))

# **7. Save Processed Data**

In [None]:
# Destination folder for the exported files; created if it does not exist yet.
output_dir = '/kaggle/working/preprocessed'
os.makedirs(output_dir, exist_ok=True)

# Append the target as the last column of each scaled split. `.values` discards
# the index, so the target is attached by position.
train_data = X_train_scaled.assign(quality=y_train.values)
test_data = X_test_scaled.assign(quality=y_test.values)

def dataframe_to_arff(df, relation_name):
    """Build an ARFF description dictionary from a DataFrame.

    A 'date_time' column, if present, is left out. Every remaining column is
    declared as a NUMERIC attribute. The input frame is not modified.

    Parameters
    ----------
    df : pandas.DataFrame
        Table to describe.
    relation_name : str
        Value stored under the 'relation' key.

    Returns
    -------
    dict
        Dictionary with the keys 'relation', 'attributes' (list of
        ``(column, 'NUMERIC')`` tuples) and 'data' (list of row lists).
    """
    # drop() returns a new frame, so the caller's DataFrame stays untouched.
    features = df.drop(columns=['date_time'], errors='ignore')

    return {
        'relation': relation_name,
        'attributes': [(name, 'NUMERIC') for name in features.columns],
        'data': features.values.tolist(),
    }

print("Training data columns:", train_data.columns.tolist())
print("Test data columns:", test_data.columns.tolist())

train_path = f'{output_dir}/train_data.arff'
test_path = f'{output_dir}/test_data.arff'

# Describe each split as an ARFF dictionary, then serialise it to its own file.
train_arff = dataframe_to_arff(train_data, 'quality_prediction_train')
test_arff = dataframe_to_arff(test_data, 'quality_prediction_test')

for path, payload in ((train_path, train_arff), (test_path, test_arff)):
    with open(path, 'w') as f:
        f.write(arff.dumps(payload))

print("ARFF files created successfully:")
print(f"- {train_path}")
print(f"- {test_path}")

In [None]:
# `shape` is a tuple attribute, not a method; calling it raises TypeError.
train_data.shape

In [None]:
# `shape` is a tuple attribute, not a method; calling it raises TypeError.
test_data.shape