In [None]:
import pandas as pd

# Load the CSV file into a DataFrame.
df = pd.read_csv('linked_in_data.csv')

# Summarise column names, dtypes and non-null counts.
# DataFrame.info() writes its report to stdout itself and returns None, so it
# is called directly; wrapping it in print() would emit a stray "None" line.
print("Data Overview:")
df.info()

# Preview the first rows to sanity-check the parsed values.
print("\nFirst 5 Entries:")
print(df.head())

# Count missing values per column to see what the cleaning step must handle.
print("\nMissing Values:")
print(df.isnull().sum())




In [None]:
def clean_data(dataframe):
    """Return a copy of ``dataframe`` with missing text values set to 'Unknown'.

    Only object/string columns are filled. Numeric columns keep their NaN
    values: writing the string 'Unknown' into them would turn the whole
    column into mixed-type object data and break numeric operations
    (binning, correlation, plotting) performed on it afterwards.

    Args:
        dataframe: The pandas DataFrame to clean. It is not modified.

    Returns:
        A new DataFrame with the same columns and index as the input.
    """
    # Select the text columns by dtype; classic object columns and pandas'
    # dedicated string dtypes both qualify.
    text_columns = [
        column
        for column, dtype in dataframe.dtypes.items()
        if pd.api.types.is_object_dtype(dtype) or pd.api.types.is_string_dtype(dtype)
    ]
    if not text_columns:
        # Nothing to fill; still hand back a copy so the caller never
        # shares state with the input frame.
        return dataframe.copy()

    # A dict-valued fillna touches only the listed columns and, without
    # inplace=True, returns a new frame instead of mutating the input.
    return dataframe.fillna({column: 'Unknown' for column in text_columns})

# Run the cleaning step on the loaded frame.
df_cleaned = clean_data(df)

# Persist the cleaned frame as CSV, omitting the index column.
cleaned_csv_path = 'linked_in_data_cleaned.csv'
df_cleaned.to_csv(cleaned_csv_path, index=False)


In [None]:
import seaborn as sns
import matplotlib.pyplot as plt

# Load cleaned data
df = pd.read_csv('linked_in_data_cleaned.csv')

# Basic statistics
print("Basic Statistical Overview:")
print(df.describe())

# Correlation heatmap.
# Correlation is only defined for numeric data, so restrict the frame to its
# numeric columns first. Calling df.corr() on a frame that still holds text
# columns raises on pandas >= 2.0, where numeric_only defaults to False.
numeric_df = df.select_dtypes(include='number')
plt.figure(figsize=(10, 8))
sns.heatmap(numeric_df.corr(), annot=True, cmap='coolwarm')
plt.title('Feature Correlation Matrix')
plt.show()


In [None]:
def feature_engineering(dataframe, bins=(0, 100, 500, 1000, 5000),
                        labels=('Low', 'Medium', 'High', 'Very High')):
    """Return a copy of ``dataframe`` with two derived columns added.

    Added columns:
        'Profile Completeness': fraction of non-missing cells in each row,
            measured over the columns of the input frame.
        'Connections Category': the 'Connections' value bucketed by ``bins``
            and named by ``labels``.

    The input frame is left untouched, so calling the function again on the
    same frame gives the same result (the derived columns never feed back
    into the completeness ratio).

    Args:
        dataframe: pandas DataFrame containing a 'Connections' column.
        bins: Ascending bin edges for the connection buckets.
        labels: One label per bucket (``len(bins) - 1`` entries).

    Returns:
        A new DataFrame with the original columns plus the two derived ones.

    Raises:
        KeyError: If ``dataframe`` has no 'Connections' column.
    """
    featured = dataframe.copy()

    # count(axis=1) is the number of non-missing cells per row; dividing by
    # the column count turns it into a 0..1 ratio.
    featured['Profile Completeness'] = dataframe.count(axis=1) / len(dataframe.columns)

    # Bin on a numeric view of the column so that placeholder text (for
    # example a filled-in 'Unknown') becomes NaN instead of making pd.cut
    # raise. The 'Connections' column itself is not altered.
    connections = pd.to_numeric(dataframe['Connections'], errors='coerce')

    # include_lowest=True keeps a value equal to the first edge (0 by default)
    # in the first bucket; with right-closed bins it would otherwise be NaN.
    # Values above the last edge still fall outside every bucket and are NaN.
    featured['Connections Category'] = pd.cut(
        connections,
        bins=list(bins),
        labels=list(labels),
        include_lowest=True,
    )

    return featured

# Derive the extra feature columns from the cleaned frame.
# NOTE(review): df_cleaned has already had missing values filled by
# clean_data, so 'Profile Completeness' is measured on the filled data --
# confirm that this is the intended input.
df_featured = feature_engineering(df_cleaned)

# Write the enriched frame to CSV, omitting the index column.
featured_csv_path = 'linked_in_data_featured.csv'
df_featured.to_csv(featured_csv_path, index=False)


In [None]:
# Read the feature-enriched dataset back from disk.
df = pd.read_csv('linked_in_data_featured.csv')

# Histogram with a KDE overlay of the 'Profile Completeness' column.
hist_fig, hist_ax = plt.subplots(figsize=(8, 6))
sns.histplot(df['Profile Completeness'], bins=20, kde=True, ax=hist_ax)
hist_ax.set_title('Distribution of Profile Completeness')
hist_ax.set_xlabel('Completeness')
hist_ax.set_ylabel('Count')
plt.show()

# Scatter plot of 'Followers' against 'Connections'.
scatter_fig, scatter_ax = plt.subplots(figsize=(10, 6))
sns.scatterplot(data=df, x='Connections', y='Followers', ax=scatter_ax)
scatter_ax.set_title('Connections vs Followers')
scatter_ax.set_xlabel('Connections')
scatter_ax.set_ylabel('Followers')
plt.show()
