In [None]:
#1. 
# Import all the libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt 
import seaborn as sns

In [None]:
#2.
# Read the raw CSV (path is relative to the notebook's working directory)
# into the working DataFrame used by every later cell.
data_path = r'data.csv'
df = pd.read_csv(data_path)

In [None]:
# Inspect the dtype pandas inferred for each column
column_dtypes = df.dtypes
print(column_dtypes)

# Template for casting a column when the inferred dtype is wrong, e.g. to integer:
# df['column_name'] = df['column_name'].astype(int)

In [None]:
#3.
# First look at the data: structure, first/last rows and summary statistics.
# A notebook cell only renders the value of its LAST expression, so the
# intermediate frames are printed explicitly; as bare expressions, head() and
# tail() would be computed and silently discarded.
df.info()          # info() writes its report to stdout by itself
print(df.head())
print(df.tail())
df.describe().T    # last expression: rendered by the notebook

In [None]:
#6.
# Normalise column labels: lower-case them, then turn spaces into underscores
lowercased_labels = df.columns.str.lower()
df.columns = lowercased_labels.str.replace(' ', '_')


"df.columns = df.columns.str.lower().str.replace(' ', '_')\n"

In [None]:
# List the distinct values of every object-dtype (categorical) column
categorical_cols = df.select_dtypes(include='object').columns
for name in categorical_cols:
    distinct_values = df[name].unique()
    print(f"Unique values in {name}:\n", distinct_values)

In [None]:
#4.
# Count the missing entries in each column
missing_per_column = df.isnull().sum()
missing_per_column

In [None]:
#5.
# Handle the null values:
#   - categorical (object) columns are filled with their most frequent value
#   - numerical columns are filled with their median
#
# The filled Series is assigned back to the column instead of calling
# `df[col].fillna(..., inplace=True)`. That form mutates an intermediate
# Series: it emits a chained-assignment warning in pandas 2.x and leaves `df`
# untouched once Copy-on-Write is active.
for col in df.select_dtypes(include='object').columns:
    modes = df[col].mode()
    # mode() ignores NaN, so it is empty for an all-null column; there is
    # nothing meaningful to fill with in that case, so the column is left as is.
    if not modes.empty:
        df[col] = df[col].fillna(modes.iloc[0])

for col in df.select_dtypes(include='number').columns:
    # median() skips NaN; for an all-null column it is NaN and the fill is a no-op
    df[col] = df[col].fillna(df[col].median())


"\nfor col in df.select_dtypes(include='object').columns:\n    df[col].fillna(df[col].mode()[0], inplace=True)\n\n# Example: Fill missing values with median for numerical columns\nfor col in df.select_dtypes(include='number').columns:\n    df[col].fillna(df[col].median(), inplace=True)\n"

In [None]:
# Label encoding of the categorical (object) columns
from sklearn.preprocessing import LabelEncoder

# NOTE(review): the original heading said "binary categorical variables", but
# the loop encodes EVERY object column. After this cell no object columns
# remain, so a later one-hot step that selects object columns has nothing to do.
# Confirm which columns are really meant to be label-encoded.
#
# A fresh encoder is fitted per column and kept in `label_encoders`; reusing a
# single encoder would overwrite its fitted classes on every iteration, so only
# the last column's mapping could be inspected or inverted afterwards.
label_encoders = {}
for col in df.select_dtypes(include='object').columns:
    label_enc = LabelEncoder()
    df[col] = label_enc.fit_transform(df[col])
    label_encoders[col] = label_enc

In [None]:
# One-hot encode the object-dtype columns; drop_first=True omits the first
# level of each column from the generated dummies
object_cols = df.select_dtypes(include='object').columns
df = pd.get_dummies(df, columns=object_cols, drop_first=True)

In [None]:
#7.
# Report how many fully duplicated rows exist, drop them, then report again

duplicates_before = df.duplicated().sum()
print(f"Number of duplicate rows: {duplicates_before}")

# Keep the first occurrence of each duplicated row
df.drop_duplicates(inplace=True)

duplicates_after = df.duplicated().sum()
print(f"Number of duplicate rows after removal: {duplicates_after}")

In [None]:
#8.
# Visual outlier check: one boxplot figure per numeric column

numeric_cols = df.select_dtypes(include='number').columns
for feature in numeric_cols:
    sns.boxplot(x=feature, data=df)
    plt.title(f'Boxplot of {feature}')
    plt.show()

# Example: Remove outliers using the IQR method
#
# All bounds are computed ONCE on the unfiltered data and applied together.
# Filtering column by column recomputes the quartiles on rows that earlier
# columns already thinned out, so the result depends on column order.
iqr_cols = df.select_dtypes(include='number').columns
q1 = df[iqr_cols].quantile(0.25)
q3 = df[iqr_cols].quantile(0.75)
iqr = q3 - q1

# Skip columns whose IQR is 0 (e.g. an imbalanced 0/1 flag or encoded target):
# their fences collapse to a single value, so every row holding any other
# value would be discarded as an "outlier".
usable_cols = iqr[iqr > 0].index
lower_fence = q1[usable_cols] - 1.5 * iqr[usable_cols]
upper_fence = q3[usable_cols] + 1.5 * iqr[usable_cols]

# Comparisons with NaN are False, so rows with missing values are kept,
# exactly as with the per-column `~((x < lo) | (x > hi))` filter.
below = df[usable_cols].lt(lower_fence, axis=1)
above = df[usable_cols].gt(upper_fence, axis=1)
is_outlier_row = (below | above).any(axis=1)

# .copy() makes the result an independent frame, so later in-place cells
# (scaling, dropna, ...) do not operate on a slice of the old DataFrame.
df = df.loc[~is_outlier_row].copy()

SyntaxError: incomplete input (2332613448.py, line 2)

In [None]:
# Pair plot: scatter-plot matrix over all numeric columns
numeric_subset = df.select_dtypes(include='number')
sns.pairplot(numeric_subset)
plt.show()

In [None]:
#9.
# Keep just the numeric columns
numeric_df = df.select_dtypes(include='number')

# Annotated correlation heatmap, used to spot multicollinearity between features
corr_matrix = numeric_df.corr()
sns.heatmap(corr_matrix, cmap='coolwarm', annot=True, fmt=".2f")
plt.title('Correlation Matrix')
plt.show()

In [None]:
# Target variable should be numeric
# Count of rows per target class
sns.countplot(x='leaveornot', data=df)
plt.title('Distribution of Target Variable')
plt.show()

# One boxplot per selected feature, split by target class
features_vs_target = ('age', 'joiningyear')
for feature in features_vs_target:
    sns.boxplot(x='leaveornot', y=feature, data=df)
    plt.title(f'Relationship between {feature} and Target')
    plt.show()

In [None]:
# Count plot with the count written on top of each bar.
# change x axis: set `count_col` to the column to plot.
# Column names were lower-cased (and spaces replaced by underscores) earlier in
# the notebook, so a capitalised name such as 'Pclass' can never match.
# NOTE(review): 'pclass' looks carried over from another dataset -- confirm the
# column exists in data.csv or replace it with the intended one.
count_col = 'pclass'

sns.set(rc={'figure.figsize':(12,8)})
ax = sns.countplot(data=df, x=count_col, color='green', edgecolor='black')

# Each container holds one group of bars; label every bar with its height
for bars in ax.containers:
    ax.bar_label(bars)

In [None]:
# Correlation of every numeric/boolean feature with the target variable.
# The frame is restricted to numeric and bool columns first: from pandas 2.0
# on, DataFrame.corr() raises if any column cannot be converted to float
# (e.g. a remaining string column) instead of silently skipping it.
corr_candidates = df.select_dtypes(include=['number', 'bool'])
correlation_with_target = corr_candidates.corr()['leaveornot'].sort_values(ascending=False)
print("Correlation with Target Variable:\n", correlation_with_target)

In [None]:
# Class balance: number of rows in each target class
sns.countplot(x='leaveornot', data=df)
plt.title('Class Distribution')
plt.show()

In [None]:
#Grouped Analysis
# Mean of selected numerical features within each target class
by_target = df.groupby('leaveornot')
print("Grouped Statistics by Leave or Not:")
print(by_target[['age', 'salary']].mean())

# Bar chart of the mean salary per target class
mean_salary = by_target['salary'].mean()
mean_salary.plot(kind='bar', color='skyblue')
plt.title('Average Salary by Leave or Not')
plt.xlabel('Leave or Not')
plt.ylabel('Average Salary')
plt.show()

In [None]:
#Advanced Visualizations
# Violin plot: age distribution within each target class
sns.violinplot(x='leaveornot', y='age', data=df)
plt.title('Age Distribution by Leave or Not')
plt.show()

# Heatmap of the department x target contingency table (integer counts)
dept_vs_target = pd.crosstab(df['department'], df['leaveornot'])
sns.heatmap(dept_vs_target, cmap='Blues', annot=True, fmt='d')
plt.title('Department vs Leave or Not')
plt.show()

In [None]:
#Feature Importance
from sklearn.ensemble import RandomForestRegressor

# Feature importance using Random Forest Regressor.
# Column names were lower-cased earlier in the notebook, so the capitalised
# 'Fare' can never match; the regression target is named once here.
# NOTE(review): 'fare' looks carried over from another dataset -- confirm the
# column exists in data.csv or set `target_col` to the intended target.
target_col = 'fare'
X = df.drop(target_col, axis=1)
y = df[target_col]

# A fixed random_state makes the fitted forest, and therefore the importance
# ranking below, reproducible across Restart & Run All.
model = RandomForestRegressor(random_state=42)
model.fit(X, y)

# Plot feature importance, most important feature first
importance = pd.Series(model.feature_importances_, index=X.columns)
importance.sort_values(ascending=False).plot(kind='bar', figsize=(10, 6))
plt.title('Feature Importance')
plt.show()

In [None]:
# Report the sample skewness (scipy.stats.skew) of every numeric column
from scipy.stats import skew

numeric_cols = df.select_dtypes(include='number').columns
for name in numeric_cols:
    column_skew = skew(df[name])
    print(f"Skewness of {name}: {column_skew}")

In [None]:
#Feature Scaling
from sklearn.preprocessing import MinMaxScaler

# Min-max scale every numeric column and write the scaled values back
scaler = MinMaxScaler()
numeric_cols = df.select_dtypes(include='number').columns
scaled_values = scaler.fit_transform(df[numeric_cols])
df[numeric_cols] = scaled_values

In [None]:
#Actual Columns and rows
#Code for a Line Graph
import matplotlib.pyplot as plt

# Example: line graph for a trend over time.
# Replace 'year' and 'sales' with your actual column names.
x = df['year']   # horizontal axis values (e.g. years)
y = df['sales']  # vertical axis values (e.g. sales data)

line_style = dict(marker='o', linestyle='-', color='b', label='Sales Trend')
plt.plot(x, y, **line_style)
plt.title('Sales Trend Over Years')
plt.xlabel('Year')
plt.ylabel('Sales')
plt.legend()
plt.grid(True)
plt.show()

In [None]:
#10.
# VALIDATION AND ASSERTIONS
# -----------------------------------------------
# Turn +/- infinity into NaN, then drop every row that contains a NaN

infinities = [np.inf, -np.inf]
df.replace(to_replace=infinities, value=np.nan, inplace=True)
df.dropna(inplace=True)

In [None]:
#Visualize Distributions Using Histograms
import matplotlib.pyplot as plt

# One 20-bin histogram figure per numeric column
hist_style = dict(bins=20, alpha=0.7, edgecolor='black')
numeric_cols = df.select_dtypes(include='number').columns
for name in numeric_cols:
    plt.hist(df[name], **hist_style)
    plt.title(f"Histogram of {name}")
    plt.xlabel(name)
    plt.ylabel("Frequency")
    plt.show()

In [None]:
#11.
# Histogram
# Grid of histograms drawn by DataFrame.hist, 20 bins each, in a single figure
hist_options = dict(bins=20, figsize=(10, 8))
df.hist(**hist_options)
plt.tight_layout()
plt.show()

In [None]:
# Final validation: report shape, remaining nulls per column and duplicate rows
final_shape = df.shape
remaining_nulls = df.isnull().sum()
remaining_duplicates = df.duplicated().sum()

print("Final dataset shape:", final_shape)
print("Missing values:\n", remaining_nulls)
print("Duplicate rows:", remaining_duplicates)