### Data Cleaning

In [8]:
import pandas as pd

# Load the heart dataset
heart_data = pd.read_csv('heart.csv')

# Display the first few rows of the dataset
print(heart_data.head())

# Check for missing values
missing_values = heart_data.isnull().sum()
print("\nMissing values in the dataset:")
print(missing_values)

# Remove rows with missing values
heart_data_clean = heart_data.dropna()

# Check for duplicates and remove them
heart_data_clean = heart_data_clean.drop_duplicates()

# Display the cleaned dataset
print("\nCleaned dataset:")
print(heart_data_clean.head())

   age  sex  cp  trestbps  chol  fbs  restecg  thalach  exang  oldpeak  slope  \
0   63    1   3       145   233    1        0      150      0      2.3      0   
1   37    1   2       130   250    0        1      187      0      3.5      0   
2   41    0   1       130   204    0        0      172      0      1.4      2   
3   56    1   1       120   236    0        1      178      0      0.8      2   
4   57    0   0       120   354    0        1      163      1      0.6      2   

   ca  thal  target  
0   0     1       1  
1   0     2       1  
2   0     2       1  
3   0     2       1  
4   0     2       1  

Missing values in the dataset:
age         0
sex         0
cp          0
trestbps    0
chol        0
fbs         0
restecg     0
thalach     0
exang       0
oldpeak     0
slope       0
ca          0
thal        0
target      0
dtype: int64

Cleaned dataset:
   age  sex  cp  trestbps  chol  fbs  restecg  thalach  exang  oldpeak  slope  \
0   63    1   3       145   233    1     

### Data Integration

In [9]:
# Load the air quality dataset
air_quality_data = pd.read_csv('air.csv')

# Display the first few rows of the air quality dataset
print(air_quality_data.head())
# Concatenate the datasets vertically
merged_data = pd.concat([heart_data_clean, air_quality_data])# ignore_index=True)

# Display the first few rows of the concatenated dataset
print("\nConcatenated dataset:")
print(merged_data.head())


  air_quality_data = pd.read_csv('air.csv')


               time  station AMB_TEMP  CH4    CO  NMHC   NO NO2 NOx  O3  ...  \
0  2015/01/01 00:00  Banqiao       16  2.1  0.79  0.14  1.2  16  17  37  ...   
1  2015/01/01 01:00  Banqiao       16  2.1   0.8  0.15  1.3  16  17  36  ...   
2  2015/01/01 02:00  Banqiao       16  2.1  0.71  0.13    1  13  14  38  ...   
3  2015/01/01 03:00  Banqiao       15    2  0.66  0.12  0.8  11  12  39  ...   
4  2015/01/01 04:00  Banqiao       15    2  0.53  0.11  0.6  10  11  38  ...   

  RAINFALL RAIN_COND  RH  SO2  THC UVB WD_HR WIND_DIREC WIND_SPEED WS_HR  
0       NR        NR  57   12  2.2   0    69         69        4.7   4.2  
1       NR        NR  57   11  2.2   0    67         65          4     4  
2       NR        NR  57    8  2.2   0    63         53        3.7   3.5  
3       NR        NR  58  6.5  2.2   0    63         63        4.1   3.3  
4       NR        NR  58  5.5  2.1   0    69         67          3   3.1  

[5 rows x 23 columns]

Concatenated dataset:
    age  sex   cp  tres

### Data Transformation

In [10]:
# Display the column names of merged_data
print("\nColumn names in the merged_data DataFrame:")
print(merged_data.columns)



Column names in the merged_data DataFrame:
Index(['age', 'sex', 'cp', 'trestbps', 'chol', 'fbs', 'restecg', 'thalach',
       'exang', 'oldpeak', 'slope', 'ca', 'thal', 'target', 'time', 'station',
       'AMB_TEMP', 'CH4', 'CO', 'NMHC', 'NO', 'NO2', 'NOx', 'O3', 'PH_RAIN',
       'PM10', 'PM2.5', 'RAINFALL', 'RAIN_COND', 'RH', 'SO2', 'THC', 'UVB',
       'WD_HR', 'WIND_DIREC', 'WIND_SPEED', 'WS_HR'],
      dtype='object')


In [11]:
numerical_columns = [
    'age',  'sex',   'cp',  'trestbps',   'chol',  'fbs',  'restecg',  'thalach'
]
categorical_columns = [
    'RH', 'WD_HR','WIND_DIREC'
]


In [12]:
# Convert date/time columns to numeric (UNIX timestamp)
for col in numerical_columns:
    if merged_data[col].dtype == 'object':  # Check if column is of object type
        merged_data[col] = pd.to_datetime(merged_data[col], errors='coerce')
        merged_data[col] = merged_data[col].astype('int64')  # Convert to UNIX timestamp

# Handle missing values
merged_data[numerical_columns] = merged_data[numerical_columns].fillna(merged_data[numerical_columns].mean())


In [14]:
from sklearn.preprocessing import OneHotEncoder
for col in numerical_columns:
    merged_data[col] = pd.to_numeric(merged_data[col], errors='coerce')

encoder = OneHotEncoder(sparse_output=False)



### Data Modeling

In [15]:
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline

# Load the dataset
data = pd.read_csv('heart.csv')  # Replace 'your_dataset.csv' with your dataset file path

# Define the features and target variable
features = data.drop('target', axis=1)  # Replace 'target' with your target column name
target = data['target']

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(features, target, test_size=0.2, random_state=42)

# Define numerical and categorical columns
numerical_columns = X_train.select_dtypes(include=['float64', 'int64']).columns
categorical_columns = X_train.select_dtypes(include=['object']).columns

# Create a column transformer
column_transformer = ColumnTransformer([
    ('scaler', StandardScaler(), numerical_columns),
    ('encoder', OneHotEncoder(handle_unknown='ignore'), categorical_columns)
])

# Create a RandomForestClassifier model
model = RandomForestClassifier(random_state=42)

# Create a pipeline
pipeline = Pipeline([
    ('preprocessor', column_transformer),
    ('classifier', model)
])

# Define hyperparameter grid for tuning
param_grid = {
    'classifier__n_estimators': [100, 200],
    'classifier__max_depth': [5, 10, None]
}

# Perform grid search for hyperparameter tuning
grid_search = GridSearchCV(pipeline, param_grid, cv=5)
grid_search.fit(X_train, y_train)

# Get the best model
best_model = grid_search.best_estimator_

# Evaluate the model on the test set
y_pred = best_model.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)

print(f"Accuracy on the test set: {accuracy * 100:.2f}%")
print("\nClassification Report:")
print(classification_report(y_test, y_pred))

# Save the model if needed
# import joblib
# joblib.dump(best_model, 'trained_model.pkl')


Accuracy on the test set: 86.89%

Classification Report:
              precision    recall  f1-score   support

           0       0.89      0.83      0.86        29
           1       0.85      0.91      0.88        32

    accuracy                           0.87        61
   macro avg       0.87      0.87      0.87        61
weighted avg       0.87      0.87      0.87        61



In [None]:
Sure, let's break down the code and explain each part:

1. **Importing Libraries**:
   ```python
   import pandas as pd
   ```
   This imports the Pandas library with an alias `pd`, which is commonly used for data manipulation and analysis.

2. **Loading Data**:
   ```python
   heart_data = pd.read_csv('heart.csv')
   ```
   This line reads a CSV file named `'heart.csv'` into a Pandas DataFrame called `heart_data`.

3. **Displaying Data**:
   ```python
   print(heart_data.head())
   ```
   This line prints the first few rows of the DataFrame `heart_data` to inspect the data.

4. **Checking for Missing Values**:
   ```python
   missing_values = heart_data.isnull().sum()
   print("\nMissing values in the dataset:")
   print(missing_values)
   ```
   This code calculates the sum of missing values in each column of the DataFrame `heart_data` and prints the result.

5. **Cleaning Data**:
   ```python
   heart_data_clean = heart_data.dropna().drop_duplicates()
   ```
   This line removes rows with missing values and duplicates from the DataFrame `heart_data` and assigns the cleaned data to `heart_data_clean`.

6. **Loading Another Dataset**:
   ```python
   air_quality_data = pd.read_csv('air.csv')
   ```
   This code reads another CSV file named `'air.csv'` into a new DataFrame called `air_quality_data`.

7. **Concatenating Data**:
   ```python
   merged_data = pd.concat([heart_data_clean, air_quality_data])
   ```
   This line concatenates the `heart_data_clean` DataFrame and `air_quality_data` DataFrame vertically (row-wise).

8. **Displaying Concatenated Data**:
   ```python
   print("\nConcatenated dataset:")
   print(merged_data.head())
   ```
   This code prints the first few rows of the concatenated DataFrame `merged_data`.

9. **Handling Numerical Columns**:
   ```python
   for col in numerical_columns:
       merged_data[col] = pd.to_numeric(merged_data[col], errors='coerce')
   ```
   This loop iterates over numerical columns and converts their data type to numeric, handling any errors by coercing invalid parsing to NaN.

10. **One-Hot Encoding**:
    ```python
    encoder = OneHotEncoder(sparse_output=False)
    ```
    This line initializes a OneHotEncoder object with the parameter `sparse_output=False`, which means it will return a dense array instead of a sparse matrix.

11. **Importing Libraries for Machine Learning**:
    ```python
    from sklearn.model_selection import train_test_split, GridSearchCV
    from sklearn.ensemble import RandomForestClassifier
    from sklearn.metrics import accuracy_score, classification_report
    from sklearn.preprocessing import StandardScaler, OneHotEncoder
    from sklearn.compose import ColumnTransformer
    from sklearn.pipeline import Pipeline
    ```
    These lines import various modules from scikit-learn (`sklearn`) library for machine learning tasks such as data preprocessing, model selection, and evaluation.

12. **Splitting Data for Training and Testing**:
    ```python
    X_train, X_test, y_train, y_test = train_test_split(features, target, test_size=0.2, random_state=42)
    ```
    This code splits the data into training and testing sets using `train_test_split` function from scikit-learn.

13. **Creating a Pipeline**:
    ```python
    pipeline = Pipeline([
        ('preprocessor', column_transformer),
        ('classifier', model)
    ])
    ```
    This line creates a pipeline that includes preprocessing steps (scaling and encoding) and a machine learning model (RandomForestClassifier).

14. **Hyperparameter Tuning**:
    ```python
    grid_search = GridSearchCV(pipeline, param_grid, cv=5)
    grid_search.fit(X_train, y_train)
    ```
    This code performs grid search cross-validation to find the best hyperparameters for the RandomForestClassifier model.

15. **Evaluation**:
    ```python
    y_pred = best_model.predict(X_test)
    accuracy = accuracy_score(y_test, y_pred)
    ```
    This code evaluates the best model on the test set and calculates the accuracy score.

16. **Printing Results**:
    ```python
    print(f"Accuracy on the test set: {accuracy * 100:.2f}%")
    print("\nClassification Report:")
    print(classification_report(y_test, y_pred))
    ```
    These lines print the accuracy score and classification report of the model on the test set.

This code essentially loads two datasets, cleans them, concatenates them, preprocesses the data, builds a machine learning pipeline, tunes hyperparameters, and evaluates the model's performance. It's a comprehensive example of a typical machine learning workflow using Python and scikit-learn.