# Loan Data Analysis

## 0. Install Required Libraries

First, install and import all the libraries required for the analysis.

**Note**: This notebook is configured for Google Colab, but it also runs locally:
1. It first installs the required packages.
2. In Colab, it mounts your Google Drive and then prompts you to upload `loan_data.csv`.
3. Outside Colab, it reads `loan_data.csv` from the current working directory.

In [None]:
# Install required packages
!pip install pandas numpy matplotlib seaborn scikit-learn

# Import and configure libraries
try:
    import pandas as pd
    import numpy as np
except ImportError:
    !pip install pandas numpy
    import pandas as pd
    import numpy as np

try:
    import matplotlib.pyplot as plt
    import seaborn as sns
    # Configure plotting settings
    plt.rcParams['figure.figsize'] = [10, 6]
    plt.rcParams['axes.grid'] = True
except ImportError:
    !pip install matplotlib seaborn
    import matplotlib.pyplot as plt
    import seaborn as sns
    plt.rcParams['figure.figsize'] = [10, 6]
    plt.rcParams['axes.grid'] = True

try:
    from sklearn.preprocessing import LabelEncoder, OneHotEncoder
except ImportError:
    !pip install scikit-learn
    from sklearn.preprocessing import LabelEncoder, OneHotEncoder

print("All required libraries are installed and imported successfully!")

# Load the dataset: uploaded through the browser in Colab, read from disk otherwise
import io

DATA_FILENAME = 'loan_data.csv'

# Only the import itself decides which environment we are in, so an
# ImportError raised later (while mounting, uploading or parsing) is not
# mistaken for "not running in Colab".
try:
    from google.colab import drive, files
    IN_COLAB = True
except ImportError:
    IN_COLAB = False

if IN_COLAB:
    print("Running in Google Colab environment")

    # Mount Google Drive at /content/drive
    drive.mount('/content/drive')

    # Upload the data file; `uploaded` is indexed by file name and holds the raw bytes
    uploaded = files.upload()
    if not uploaded:
        raise FileNotFoundError(
            f"No file was uploaded; expected '{DATA_FILENAME}'."
        )

    # Prefer the expected name, but accept whatever was uploaded: the key can
    # differ (e.g. Colab may store a re-uploaded file as 'loan_data (1).csv').
    uploaded_name = DATA_FILENAME if DATA_FILENAME in uploaded else next(iter(uploaded))
    df = pd.read_csv(io.BytesIO(uploaded[uploaded_name]))
else:
    print("Running in local environment")
    # Load data locally from the current working directory
    df = pd.read_csv(DATA_FILENAME)

# Display basic information
print("Dataset Info:")
df.info()

print("\nFirst few rows:")
df.head()

: 

## 2. Data Visualization

### 2.1 Target Variable Analysis

In [None]:
# Bar chart of class frequencies; the last column of `df` is used as the target
target_col = df.columns[-1]
target_counts = df[target_col].value_counts()

fig, ax = plt.subplots(figsize=(10, 6))
target_counts.plot(kind='bar', ax=ax)
ax.set_title('Target Variable Distribution')
ax.set_xlabel('Class')
ax.set_ylabel('Count')
plt.show()

### 2.2 Feature Analysis

Analyze numerical and categorical features separately

In [None]:
# Split the column names by dtype: int64/float64 columns vs. object-dtype columns
numeric_dtypes = ['int64', 'float64']
categorical_dtypes = ['object']

numerical_cols = df.select_dtypes(include=numeric_dtypes).columns
categorical_cols = df.select_dtypes(include=categorical_dtypes).columns

print("Numerical columns:", list(numerical_cols))
print("\nCategorical columns:", list(categorical_cols))

In [None]:
# One bar chart of value frequencies per categorical column (first 3 only)
for col in categorical_cols[:3]:
    level_counts = df[col].value_counts()
    fig, ax = plt.subplots(figsize=(10, 6))
    level_counts.plot(kind='bar', ax=ax)
    ax.set_title(f'Distribution of {col}')
    plt.xticks(rotation=45)
    plt.show()

In [None]:
# Pairwise correlations between the numerical columns, drawn as an annotated heatmap
corr_matrix = df[numerical_cols].corr()

fig, ax = plt.subplots(figsize=(12, 8))
sns.heatmap(corr_matrix, annot=True, cmap='coolwarm', center=0, ax=ax)
ax.set_title('Correlation Heatmap')
plt.show()

## 3. Data Preprocessing

### 3.1 Missing Values Analysis

In [None]:
# Check missing values: count NaNs per column, then keep only affected columns
missing_values = df.isnull().sum()
columns_with_missing = missing_values[missing_values > 0]

print("Missing values per column:")
print(columns_with_missing)

# Visualize missing values.
# Guarded because pandas cannot draw a bar chart from an empty Series, which
# is exactly what we have when the dataset is complete.
if columns_with_missing.empty:
    print("No missing values found - nothing to plot.")
else:
    plt.figure(figsize=(10, 6))
    columns_with_missing.plot(kind='bar')
    plt.title('Missing Values by Column')
    plt.xticks(rotation=45)
    plt.show()

### 3.2 Handle Missing Values

In [None]:
# Fill numerical missing values with median.
# The result is assigned back to the column: calling an in-place method on
# `df[col]` acts on an intermediate object, which pandas deprecates and which
# leaves `df` unchanged when Copy-on-Write is enabled.
for col in numerical_cols:
    if df[col].isnull().any():
        df[col] = df[col].fillna(df[col].median())

# Fill categorical missing values with mode (most frequent value)
for col in categorical_cols:
    if df[col].isnull().any():
        modes = df[col].mode()
        # A column that is entirely NaN has no mode; leave it untouched
        # instead of failing on an empty result.
        if not modes.empty:
            df[col] = df[col].fillna(modes.iloc[0])

### 3.3 Outlier Detection and Treatment

In [None]:
def detect_outliers(df, col):
    """Return the values of ``df[col]`` that fall outside the IQR fences.

    The fences are ``Q1 - 1.5 * IQR`` and ``Q3 + 1.5 * IQR``, where Q1 and Q3
    are the 25th and 75th percentiles of the column.

    Args:
        df: DataFrame containing the column to inspect.
        col: Name of the column to inspect.

    Returns:
        A Series holding only the values strictly below the lower fence or
        strictly above the upper fence, keeping their original index labels.
    """
    values = df[col]
    first_quartile = values.quantile(0.25)
    third_quartile = values.quantile(0.75)
    spread = third_quartile - first_quartile

    low_fence = first_quartile - 1.5 * spread
    high_fence = third_quartile + 1.5 * spread

    outside_fences = (values < low_fence) | (values > high_fence)
    return values[outside_fences]

# Report every numerical column that has IQR outliers and draw its box plot
for col in numerical_cols:
    outliers = detect_outliers(df, col)
    if outliers.empty:
        # Nothing outside the fences for this column: no message, no plot
        continue

    print(f"\nOutliers in {col}: {len(outliers)} values")

    # Box plot to visualize outliers
    fig, ax = plt.subplots(figsize=(10, 6))
    sns.boxplot(x=df[col], ax=ax)
    ax.set_title(f'Box Plot of {col}')
    plt.show()

### 3.4 Feature Engineering - Categorical Encoding

In [None]:
from sklearn.preprocessing import LabelEncoder, OneHotEncoder

# Work only on categorical columns that are still in `df`. One-hot encoding
# below removes the source columns, so without this filter a second run of
# the cell would fail with a KeyError on the already-removed names.
remaining_categorical = [col for col in categorical_cols if col in df.columns]

# For binary categorical variables, use Label Encoding.
# The encoded values go into a new "<col>_encoded" column; the source column is kept.
le = LabelEncoder()
for col in remaining_categorical:
    if df[col].nunique() == 2:
        df[f"{col}_encoded"] = le.fit_transform(df[col])

# For non-binary categorical variables, use One-Hot Encoding.
# A single get_dummies call appends "<col>_<value>" indicator columns and
# removes the source columns, replacing a concat/drop cycle per column.
multiclass_cols = [col for col in remaining_categorical if df[col].nunique() > 2]
if multiclass_cols:
    df = pd.get_dummies(df, columns=multiclass_cols)

## 4. Logistic Regression Implementation from Scratch

In [None]:
class LogisticRegression:
    """Binary logistic regression trained with full-batch gradient descent.

    Attributes:
        learning_rate: Step size applied to each gradient update.
        num_iterations: Number of gradient-descent passes over the data.
        weights: 1-D array of per-feature coefficients (``None`` until ``fit``).
        bias: Scalar intercept (``None`` until ``fit``).
    """

    # float64 exp() overflows for arguments above roughly 709. Clipping logits
    # to +/-500 keeps exp() finite; at that magnitude the sigmoid is already
    # within about 1e-217 of 0 or 1, so predictions are unaffected.
    _MAX_LOGIT = 500.0

    def __init__(self, learning_rate=0.01, num_iterations=1000):
        """Store the hyper-parameters; the model is untrained until ``fit``."""
        self.learning_rate = learning_rate
        self.num_iterations = num_iterations
        self.weights = None
        self.bias = None

    def sigmoid(self, z):
        """Return the logistic function ``1 / (1 + exp(-z))`` element-wise.

        Logits are clipped to ``[-_MAX_LOGIT, _MAX_LOGIT]`` first so that
        large negative inputs do not overflow ``exp``.
        """
        z = np.clip(z, -self._MAX_LOGIT, self._MAX_LOGIT)
        return 1 / (1 + np.exp(-z))

    def fit(self, X, y):
        """Learn ``weights`` and ``bias`` from the training data.

        Args:
            X: Array-like of shape (n_samples, n_features). Anything
                convertible to a float array is accepted, including a
                DataFrame that mixes numeric and boolean columns.
            y: Array-like of n_samples binary labels (0 or 1).
        """
        # Convert up front: a DataFrame with mixed bool/float columns would
        # otherwise become an object array, which np.exp cannot process.
        X = np.asarray(X, dtype=float)
        # Flatten so a column vector of labels does not broadcast against
        # the 1-D prediction vector.
        y = np.asarray(y, dtype=float).ravel()

        # Initialize parameters
        n_samples, n_features = X.shape
        self.weights = np.zeros(n_features)
        self.bias = 0.0

        # Gradient descent
        for _ in range(self.num_iterations):
            # Forward pass
            linear_model = np.dot(X, self.weights) + self.bias
            y_predicted = self.sigmoid(linear_model)

            # Gradients of the mean log-loss w.r.t. weights and bias
            error = y_predicted - y
            dw = (1 / n_samples) * np.dot(X.T, error)
            db = (1 / n_samples) * np.sum(error)

            # Update parameters
            self.weights -= self.learning_rate * dw
            self.bias -= self.learning_rate * db

    def predict(self, X):
        """Return a list of 0/1 class labels, one per row of ``X``.

        A sample is labelled 1 when its predicted probability is strictly
        greater than 0.5, and 0 otherwise.
        """
        X = np.asarray(X, dtype=float)
        linear_model = np.dot(X, self.weights) + self.bias
        y_predicted = self.sigmoid(linear_model)
        # atleast_1d keeps the return type a list even for a single 1-D sample
        return np.atleast_1d(y_predicted > 0.5).astype(int).tolist()