<a href="https://colab.research.google.com/github/dhesika23/ML-StreamLit/blob/main/Dhesika_Karnan_Resolute_AI.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Install Streamlit into the notebook runtime (-q keeps pip's output quiet).
!pip install -q streamlit

In [None]:
from google.colab import drive
# Mount Google Drive at /content/drive so the Excel files referenced by
# app.py (under /content/drive/MyDrive/) are readable from this runtime.
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
%%writefile app.py
import streamlit as st
import pandas as pd
import numpy as np
from sklearn.cluster import KMeans
from sklearn.impute import SimpleImputer
from scipy.stats.mstats import winsorize
from sklearn.preprocessing import StandardScaler
from scipy.stats import zscore
from scipy.spatial import distance
from sklearn.metrics import silhouette_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score

# Load data for Task 1 (Clustering)
train_data_path = "/content/drive/MyDrive/train.xlsx"
train_df = pd.read_excel(train_data_path)

# Load train data for Task 2 (Classification)
# Both tasks currently point at the same workbook. Parsing an Excel file is
# slow and Streamlit re-executes this script on every interaction, so reuse
# the frame that was just loaded. The copy is taken here, before any
# preprocessing touches train_df, so the two frames stay independent.
train_path_classification = "/content/drive/MyDrive/train.xlsx"
if train_path_classification == train_data_path:
    train_df_classification = train_df.copy()
else:
    train_df_classification = pd.read_excel(train_path_classification)

# Load test data for Task 2 (Classification)
test_path_classification = "/content/drive/MyDrive/test.xlsx"
test_df_classification = pd.read_excel(test_path_classification)

# Load raw data for Task 3
raw_data_path_task3 = "/content/drive/MyDrive/rawdata.xlsx"
raw_data_task3 = pd.read_excel(raw_data_path_task3)

# Handle missing values for clustering
def handle_missing_values(df):
    """Fill missing values column by column and return the same DataFrame.

    Numeric columns are filled with the mean of their observed values and
    come back as float64. Every other column is filled with its most
    frequent observed value (the smallest one when several tie).

    A column with no observed values at all is left untouched, because
    there is nothing to derive a fill value from.

    Args:
        df: DataFrame to clean. It is modified in place.

    Returns:
        The same ``df`` object, with missing values filled where possible.
    """
    columns_with_missing = df.columns[df.isnull().any()]
    for col in columns_with_missing:
        observed = df[col].dropna()
        if observed.empty:
            # Entirely-missing column: no mean or mode exists, so skip it
            # instead of failing on the assignment.
            continue
        if pd.api.types.is_numeric_dtype(df[col]):
            # Cast first so integer-like columns can hold a fractional mean.
            df[col] = df[col].astype('float64').fillna(observed.mean())
        else:
            # mode() returns its values sorted, so iloc[0] is deterministic.
            df[col] = df[col].fillna(observed.mode().iloc[0])
    return df

# Handle outliers using winsorization for clustering
def handle_outliers(df):
    """Winsorize every float64/int64 column of ``df`` and return ``df``.

    Each selected column is passed to ``winsorize`` with a 5% limit on both
    tails and the result is written back over the column, so the caller's
    DataFrame is modified in place. Columns of any other dtype are not
    touched.
    """
    tail_limits = [0.05, 0.05]
    numeric_part = df.select_dtypes(include=['float64', 'int64'])
    for name in numeric_part.columns:
        clipped = winsorize(df[name], limits=tail_limits)
        df[name] = clipped
    return df

# Preprocess the clustering data: impute missing values first, then winsorize
train_df_cleaned = handle_outliers(handle_missing_values(train_df))

# The target label is not a clustering feature, so leave it out of the matrix
X_train_cluster = train_df_cleaned.drop(columns='target')

# Standardize the features before clustering
scaler_cluster = StandardScaler()
X_train_scaled_cluster = scaler_cluster.fit_transform(X_train_cluster)

# Fit a 3-cluster K-means model (fixed seed) and get one label per training row
kmeans = KMeans(n_clusters=3, random_state=42, n_init=10)
cluster_labels = kmeans.fit_predict(X_train_scaled_cluster)

# Silhouette score of the resulting partition
silhouette_avg = silhouette_score(X_train_scaled_cluster, cluster_labels)

# Record each row's cluster label on the training DataFrame
train_df['cluster'] = cluster_labels

# Identify the cluster and Mahalanobis distance for a given data point for clustering
def identify_cluster(data_point):
    """Assign a scaled data point to a K-means cluster and measure its distance.

    Args:
        data_point: 1-D sequence of feature values, in the same scaled
            feature space as ``X_train_scaled_cluster``.

    Returns:
        Tuple ``(cluster, mahalanobis_dist)``: the label predicted by the
        fitted ``kmeans`` model, and the Mahalanobis distance from the point
        to that cluster's centroid, using the covariance of the whole scaled
        training matrix.
    """
    point = np.asarray(data_point, dtype=float)
    cluster = kmeans.predict(point.reshape(1, -1))[0]
    centroid = kmeans.cluster_centers_[cluster]

    # np.cov returns a 0-d array for a single feature; force a matrix shape.
    covariance = np.atleast_2d(np.cov(X_train_scaled_cluster.T))
    # Constant or collinear features make the covariance singular, which
    # np.linalg.inv rejects with LinAlgError. The pseudo-inverse matches the
    # true inverse when one exists and still works when it does not.
    inverse_covariance = np.linalg.pinv(covariance)

    mahalanobis_dist = distance.mahalanobis(point, centroid, inverse_covariance)
    return cluster, mahalanobis_dist

# Split the classification training data into features and labels
X_train_classification = train_df_classification.drop(['target'], axis=1)
y_train_classification = train_df_classification['target']

# Fill missing values with the per-column training mean. The means are kept
# so the test set can be imputed with exactly the same values.
# numeric_only=True stops mean() from raising on a non-numeric column.
train_feature_means_classification = X_train_classification.mean(numeric_only=True)
X_train_classification = X_train_classification.fillna(train_feature_means_classification)

# Standardize features for classification
# NOTE: the model below is fitted on the unscaled features, for both train
# and test. The scaler and scaled matrix are kept only so these module-level
# names stay available.
scaler_classification = StandardScaler()
X_train_scaled_classification = scaler_classification.fit_transform(X_train_classification)

# Initialize the model for classification
# A fixed seed keeps predictions stable across Streamlit reruns (same seed
# as the K-means model).
clf_classification = RandomForestClassifier(random_state=42)

# Train the model for classification
clf_classification.fit(X_train_classification, y_train_classification)

# Make predictions on the train data for classification
y_pred_classification = clf_classification.predict(X_train_classification)

# Give the test data the same treatment as the training features: same
# columns in the same order, and missing values filled with training means.
X_test_classification = test_df_classification[X_train_classification.columns].fillna(
    train_feature_means_classification
)
test_predictions_classification = clf_classification.predict(X_test_classification)

# Calculate train accuracy for classification (measured on the training rows
# themselves, so it is not an estimate of performance on unseen data)
train_accuracy_classification = accuracy_score(y_train_classification, y_pred_classification)

# Create a DataFrame with 1-based row numbers and corresponding predicted targets
test_predictions_df_classification = pd.DataFrame({
    'Row': test_df_classification.index + 1,
    'Target': test_predictions_classification
})

# Clean and process raw data for Task 3
def clean_and_process_data(raw_data):
    """Normalize the date/time columns and derive per-record durations.

    The input frame is modified in place and returned. It must contain the
    columns ``date``, ``time`` and ``number``.

    After the call:
      * ``date`` holds strings formatted as ``DD/MM/YYYY``;
      * ``time`` holds the original time values converted to ``str``;
      * ``datetime`` holds the combined timestamp;
      * ``duration`` holds, for each row, the time elapsed since the previous
        row with the same ``number`` (in the frame's existing row order), or
        a zero Timedelta for the first row of each ``number``.

    Args:
        raw_data: DataFrame of raw records.

    Returns:
        The same ``raw_data`` object with the columns described above.
    """
    parsed_dates = pd.to_datetime(raw_data['date'])
    raw_data['date'] = parsed_dates.dt.strftime('%d/%m/%Y')
    raw_data['time'] = raw_data['time'].astype(str)

    # Build the timestamp from an ISO (YYYY-MM-DD) date string, not from the
    # DD/MM/YYYY display string. Re-parsing the day-first text without a
    # format reads days <= 12 as months (05/01 becomes May 1st) and can fail
    # or mix conventions on later days.
    iso_dates = parsed_dates.dt.strftime('%Y-%m-%d')
    raw_data['datetime'] = pd.to_datetime(iso_dates + ' ' + raw_data['time'])

    raw_data['duration'] = (
        raw_data.groupby('number')['datetime']
        .diff()
        .fillna(pd.Timedelta(seconds=0))
    )
    return raw_data

# Clean and process the data for Task 3
processed_data_task3 = clean_and_process_data(raw_data_task3)

# Aggregate the data by date for Task 3
# Compare labels on a normalized (trimmed, lower-cased) copy, so rows are not
# dropped because of capitalization or stray whitespace. The original code
# itself matched 'inside' in one place and 'Outside' in the other.
# The normalized values are used only for matching; processed_data_task3 is
# displayed later and is left unchanged.
_activity_task3 = processed_data_task3['activity'].astype(str).str.strip().str.lower()
_position_task3 = processed_data_task3['position'].astype(str).str.strip().str.lower()
_zero_duration_task3 = pd.Timedelta(seconds=0)

result_task3 = processed_data_task3.assign(
    _is_pick=_activity_task3 == 'picked',
    _is_place=_activity_task3 == 'placed',
    # Rows in the other position contribute zero time to the per-date sum.
    _inside_duration=processed_data_task3['duration'].where(
        _position_task3 == 'inside', _zero_duration_task3
    ),
    _outside_duration=processed_data_task3['duration'].where(
        _position_task3 == 'outside', _zero_duration_task3
    ),
).groupby('date').agg(
    pick_activities=('_is_pick', 'sum'),
    place_activities=('_is_place', 'sum'),
    inside_duration=('_inside_duration', 'sum'),
    outside_duration=('_outside_duration', 'sum'),
)

# Streamlit app
def main():
    """Render the Streamlit page: one section per task.

    Task 1 lets the user pick a scaled training point in the sidebar and
    shows its predicted cluster and Mahalanobis distance. Task 2 shows the
    training accuracy and the test predictions. Task 3 shows the processed
    records and the per-date aggregate.
    """
    st.title("Clustering, Classification and Aggregation")

    # Task 1: Clustering
    st.header("Task 1: Machine Learning - Clustering")
    candidate_points = X_train_scaled_cluster.tolist()
    selected_data_point = st.sidebar.selectbox(
        'Select a data point for clustering:',
        candidate_points,
    )
    cluster, mahalanobis_dist = identify_cluster(np.array(selected_data_point))
    clustering_lines = (
        f'Selected Data Point: {selected_data_point}',
        f'Predicted Cluster: {cluster}',
        f'Mahalanobis Distance to Cluster Centroid: {mahalanobis_dist}',
    )
    for line in clustering_lines:
        st.write(line)

    # Task 2: Classification
    st.header("Task 2: Machine Learning - Classification")
    st.write(f'Train Accuracy (Classification): {train_accuracy_classification}')
    st.write(test_predictions_df_classification)

    # Task 3: Data Aggregation
    st.header("Task 3: Python - Aggregating Data by Date")
    task3_sections = (
        ("Processed Data:", processed_data_task3),
        ("Aggregated Result by Date:", result_task3),
    )
    for heading, table in task3_sections:
        st.header(heading)
        st.write(table)


# Entry point: build the page when this file is executed as a script
if __name__ == "__main__":
    main()

Overwriting app.py


In [None]:
# Install the localtunnel CLI globally; the launch cell below runs it via npx.
!npm install -g localtunnel

[K[?25h/tools/node/bin/lt -> /tools/node/lib/node_modules/localtunnel/bin/lt.js
+ localtunnel@2.0.2
updated 1 package in 1.333s


In [None]:
# Three commands in one shell line:
#   1. start the Streamlit app in the background, sending its stdout and
#      stderr to /content/logs.txt;
#   2. start localtunnel in the background, forwarding port 8501
#      (assumes Streamlit is serving on 8501 - TODO confirm in logs.txt);
#   3. print this runtime's public IPv4 address
#      (presumably what the localtunnel landing page asks for - verify).
!streamlit run app.py &>/content/logs.txt & npx localtunnel --port 8501 & curl ipv4.icanhazip.com

34.16.166.115
[K[?25hnpx: installed 22 in 2.989s
your url is: https://neat-parrots-decide.loca.lt
