<a href="https://colab.research.google.com/github/dakshpaul146/SKill2/blob/main/Skill_LCA_1.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import pandas as pd
import requests
from io import StringIO

# Source locations of the two UCI datasets that are stacked together below.
url_1 = "https://archive.ics.uci.edu/ml/machine-learning-databases/iris/iris.data"
url_2 = "https://archive.ics.uci.edu/ml/machine-learning-databases/wine-quality/winequality-red.csv"

# Upper bound (seconds) on how long each download may block; without a
# timeout, requests waits indefinitely on an unresponsive server.
REQUEST_TIMEOUT = 30

# Load the first dataset (Iris). The file has no header row, so the column
# names are supplied explicitly.
response_1 = requests.get(url_1, timeout=REQUEST_TIMEOUT)
# Fail loudly on 4xx/5xx: otherwise the error page body would be handed to
# pandas and parsed as if it were CSV data.
response_1.raise_for_status()
data_1 = StringIO(response_1.text)
columns_1 = ['sepal_length', 'sepal_width', 'petal_length', 'petal_width', 'class']
df1 = pd.read_csv(data_1, header=None, names=columns_1)

# Load the second dataset (red Wine Quality). This file carries its own header
# row and uses ';' as the field separator.
response_2 = requests.get(url_2, timeout=REQUEST_TIMEOUT)
response_2.raise_for_status()
data_2 = StringIO(response_2.text)
df2 = pd.read_csv(data_2, delimiter=';')

# Row-wise stack of the two frames. Their column names do not overlap, so the
# result holds the union of both column sets and every row is NaN in the
# columns that belong to the other dataset.
merged_df = pd.concat([df1, df2], axis=0, ignore_index=True)

print("Merged Dataset:")
print(merged_df.head())

Merged Dataset:
   sepal_length  sepal_width  petal_length  petal_width        class  \
0           5.1          3.5           1.4          0.2  Iris-setosa   
1           4.9          3.0           1.4          0.2  Iris-setosa   
2           4.7          3.2           1.3          0.2  Iris-setosa   
3           4.6          3.1           1.5          0.2  Iris-setosa   
4           5.0          3.6           1.4          0.2  Iris-setosa   

   fixed acidity  volatile acidity  citric acid  residual sugar  chlorides  \
0            NaN               NaN          NaN             NaN        NaN   
1            NaN               NaN          NaN             NaN        NaN   
2            NaN               NaN          NaN             NaN        NaN   
3            NaN               NaN          NaN             NaN        NaN   
4            NaN               NaN          NaN             NaN        NaN   

   free sulfur dioxide  total sulfur dioxide  density  pH  sulphates  alcohol  \
0

In [2]:
import pandas as pd
from sklearn.preprocessing import LabelEncoder, StandardScaler

# Load the Iris dataset directly from the UCI repository. The file has no
# header row, so the column names are supplied explicitly.
try:
    url = "https://archive.ics.uci.edu/ml/machine-learning-databases/iris/iris.data"
    columns = ['sepal_length', 'sepal_width', 'petal_length', 'petal_width', 'class']
    df = pd.read_csv(url, header=None, names=columns)
    print("Dataset loaded successfully.")
except Exception as e:
    print("Error loading dataset:", e)
    # Re-raise instead of calling exit(): inside a notebook exit() asks the
    # kernel to shut down (discarding all state) rather than reporting the
    # failure. Re-raising stops the cell with a traceback and keeps the kernel.
    raise

print("\nFirst 5 rows:")
print(df.head())

# Basic quality report: per-column null counts and number of fully duplicated rows.
print("\nMissing values:\n", df.isnull().sum())
print("\nDuplicates:", df.duplicated().sum())

# Drop exact duplicate rows (first occurrence is kept; the index is not reset).
df.drop_duplicates(inplace=True)

# Replace the string species labels with integer codes.
try:
    le = LabelEncoder()
    df['class'] = le.fit_transform(df['class'])
except Exception as e:
    print("Error encoding class column:", e)
    # Do not continue: the summary below would otherwise be labelled
    # "Cleaned Data" even though the encoding step never ran.
    raise

# Standardize the four measurement columns to zero mean and unit variance.
try:
    scaler = StandardScaler()
    num_cols = ['sepal_length', 'sepal_width', 'petal_length', 'petal_width']
    df[num_cols] = scaler.fit_transform(df[num_cols])
except Exception as e:
    print("Error during standardization:", e)
    # Same reasoning as above: a half-processed frame must not be reported as clean.
    raise

print("\nCleaned Data:")
print(df.head())

Dataset loaded successfully.

First 5 rows:
   sepal_length  sepal_width  petal_length  petal_width        class
0           5.1          3.5           1.4          0.2  Iris-setosa
1           4.9          3.0           1.4          0.2  Iris-setosa
2           4.7          3.2           1.3          0.2  Iris-setosa
3           4.6          3.1           1.5          0.2  Iris-setosa
4           5.0          3.6           1.4          0.2  Iris-setosa

Missing values:
 sepal_length    0
sepal_width     0
petal_length    0
petal_width     0
class           0
dtype: int64

Duplicates: 3

Cleaned Data:
   sepal_length  sepal_width  petal_length  petal_width  class
0     -0.915509     1.019971     -1.357737      -1.3357      0
1     -1.157560    -0.128082     -1.357737      -1.3357      0
2     -1.399610     0.331139     -1.414778      -1.3357      0
3     -1.520635     0.101529     -1.300696      -1.3357      0
4     -1.036535     1.249582     -1.357737      -1.3357      0


In [3]:
import pandas as pd
from sklearn.preprocessing import LabelEncoder
# Read the Iris data from the UCI URL; the file is headerless, so the column
# names are passed in.
columns = ['sepal_length', 'sepal_width', 'petal_length', 'petal_width', 'class']
url = "https://archive.ics.uci.edu/ml/machine-learning-databases/iris/iris.data"
df = pd.read_csv(url, names=columns, header=None)

print("Dataset Preview:")
print(df.head())

# Label encoding: one integer code per distinct species string, stored in a
# new column next to the original label.
le = LabelEncoder()
encoded_labels = le.fit_transform(df['class'])
df['class_encoded'] = encoded_labels
print("\nAfter Label Encoding:")
print(df.loc[:, ['class', 'class_encoded']].head())

# One-hot encoding: one indicator column per species, each named with a
# 'class_' prefix, attached to the frame on the shared row index.
df_onehot = pd.get_dummies(df['class'], prefix='class')
df = df.join(df_onehot)
print("\nAfter One-Hot Encoding:")
print(df.head())

Dataset Preview:
   sepal_length  sepal_width  petal_length  petal_width        class
0           5.1          3.5           1.4          0.2  Iris-setosa
1           4.9          3.0           1.4          0.2  Iris-setosa
2           4.7          3.2           1.3          0.2  Iris-setosa
3           4.6          3.1           1.5          0.2  Iris-setosa
4           5.0          3.6           1.4          0.2  Iris-setosa

After Label Encoding:
         class  class_encoded
0  Iris-setosa              0
1  Iris-setosa              0
2  Iris-setosa              0
3  Iris-setosa              0
4  Iris-setosa              0

After One-Hot Encoding:
   sepal_length  sepal_width  petal_length  petal_width        class  \
0           5.1          3.5           1.4          0.2  Iris-setosa   
1           4.9          3.0           1.4          0.2  Iris-setosa   
2           4.7          3.2           1.3          0.2  Iris-setosa   
3           4.6          3.1           1.5          0

In [4]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from imblearn.over_sampling import SMOTE
from urllib.request import urlopen

# Wine dataset from the UCI repository. The file has no header row; the first
# column is the class label and the remaining thirteen are the features.
url = "https://archive.ics.uci.edu/ml/machine-learning-databases/wine/wine.data"
columns = ["Class", "Alcohol", "Malic Acid", "Ash", "Alcalinity of Ash", "Magnesium",
           "Total Phenols", "Flavanoids", "Nonflavanoid Phenols", "Proanthocyanins",
           "Color Intensity", "Hue", "OD280/OD315", "Proline"]

df = pd.read_csv(url, names=columns)

print("Original Dataset Shape:", df.shape)
print(df.head())

X = df.drop(columns=["Class"])
y = df["Class"]

# Decide train/test membership BEFORE computing any statistics, so that the
# imputation means and the scaler parameters are learned from training rows
# only. Fitting them on the full dataset leaks test-set information into the
# training data. The split depends only on the row count, `y` and the seed,
# so it selects the same rows as splitting the scaled frame would.
X_train_raw, X_test_raw, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)
train_rows = X_train_raw.index
test_rows = X_test_raw.index

# Fill missing values with the column means of the training rows.
train_means = X.loc[train_rows].mean()
X = X.fillna(train_means)

# Min-max normalization: the scaler is fitted on the training rows and then
# applied to every row, so held-out rows may fall slightly outside [0, 1].
minmax_scaler = MinMaxScaler()
minmax_scaler.fit(X.loc[train_rows])
X_normalized = minmax_scaler.transform(X)

# Standardization, again fitted on the training rows only.
standard_scaler = StandardScaler()
standard_scaler.fit(X.loc[train_rows])
X_scaled = standard_scaler.transform(X)

# Keep the original row index so the split can be recovered by label.
X_scaled_df = pd.DataFrame(X_scaled, columns=X.columns, index=X.index)

X_train = X_scaled_df.loc[train_rows]
X_test = X_scaled_df.loc[test_rows]

# Oversample the minority classes in the training portion only; the test set
# is left untouched.
smote = SMOTE(sampling_strategy="auto", random_state=42)
X_train_balanced, y_train_balanced = smote.fit_resample(X_train, y_train)

print("\nBalanced Training Set Shape:", X_train_balanced.shape)
print("Class Distribution Before SMOTE:\n", y_train.value_counts())
print("Class Distribution After SMOTE:\n", pd.Series(y_train_balanced).value_counts())

Original Dataset Shape: (178, 14)
   Class  Alcohol  Malic Acid   Ash  Alcalinity of Ash  Magnesium  \
0      1    14.23        1.71  2.43               15.6        127   
1      1    13.20        1.78  2.14               11.2        100   
2      1    13.16        2.36  2.67               18.6        101   
3      1    14.37        1.95  2.50               16.8        113   
4      1    13.24        2.59  2.87               21.0        118   

   Total Phenols  Flavanoids  Nonflavanoid Phenols  Proanthocyanins  \
0           2.80        3.06                  0.28             2.29   
1           2.65        2.76                  0.26             1.28   
2           2.80        3.24                  0.30             2.81   
3           3.85        3.49                  0.24             2.18   
4           2.80        2.69                  0.39             1.82   

   Color Intensity   Hue  OD280/OD315  Proline  
0             5.64  1.04         3.92     1065  
1             4.38  1.05  