In [2]:
# StackOverflow Developer Survey - Exploratory Data Analysis

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from scipy import stats
import warnings
warnings.filterwarnings('ignore')

In [3]:
# Setting display options for pandas
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', 100)
pd.set_option('display.width', None)

# Configure matplotlib and seaborn styles
plt.style.use('default')
sns.set_palette("husl")
plt.rcParams['figure.figsize'] = (12, 8)

In [7]:
# Load the dataset
print("\n1. Loading the dataset...")
df = pd.read_csv('../data/raw/survey_results_public.csv')
print("Dataset loaded successfully.")
print(f"Data set shape: {df.shape}")


1. Loading the dataset...
Dataset loaded successfully.
Data set shape: (49123, 170)


In [8]:
print("\n2. Initial Data Overview...")
print("-"*40)

print(f"Number of rows: {df.shape[0]:,}")
print(f"Number of columns: {df.shape[1]:,}")
print(f"Memory usage: {df.memory_usage(deep=True).sum() / 1024**2:.2f} MB")

print("\nColumn names:")
for i, col in enumerate(df.columns):
    print(f"{i+1:2d}. {col}")


2. Initial Data Overview...
----------------------------------------
Number of rows: 49,123
Number of columns: 170
Memory usage: 375.68 MB

Column names:
 1. ResponseId
 2. MainBranch
 3. Age
 4. EdLevel
 5. Employment
 6. EmploymentAddl
 7. WorkExp
 8. LearnCodeChoose
 9. LearnCode
10. LearnCodeAI
11. AILearnHow
12. YearsCode
13. DevType
14. OrgSize
15. ICorPM
16. RemoteWork
17. PurchaseInfluence
18. TechEndorseIntro
19. TechEndorse_1
20. TechEndorse_2
21. TechEndorse_3
22. TechEndorse_4
23. TechEndorse_5
24. TechEndorse_6
25. TechEndorse_7
26. TechEndorse_8
27. TechEndorse_9
28. TechEndorse_13
29. TechEndorse_13_TEXT
30. TechOppose_1
31. TechOppose_2
32. TechOppose_3
33. TechOppose_5
34. TechOppose_7
35. TechOppose_9
36. TechOppose_11
37. TechOppose_13
38. TechOppose_16
39. TechOppose_15
40. TechOppose_15_TEXT
41. Industry
42. JobSatPoints_1
43. JobSatPoints_4
44. JobSatPoints_5
45. JobSatPoints_6
46. JobSatPoints_7
47. JobSatPoints_8
48. JobSatPoints_9
49. JobSatPoints_10
50. JobSa

In [9]:
print("\n3. Data types and basic info...")
print("-"*40)
print("\nData types summary:")
print(df.dtypes.value_counts())

print(f"\nDetailed info:")
df.info()


3. Data types and basic info...
----------------------------------------

Data types summary:
object     119
float64     50
int64        1
Name: count, dtype: int64

Detailed info:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 49123 entries, 0 to 49122
Columns: 170 entries, ResponseId to JobSat
dtypes: float64(50), int64(1), object(119)
memory usage: 63.7+ MB


In [10]:
# Missing data analysis
print("\n4. Missing data analysis...")
print("-"*40)

missing_data = df.isnull().sum()
missing_percent = (missing_data / len(df)) * 100
missing_df = pd.DataFrame({
    'Column': df.columns,
    'Missing_Count': missing_data,
    'Missing_Percentage': missing_percent
}).sort_values('Missing_Percentage', ascending=False)

print("Top 20 columns with most missing values:")
print(missing_df.head(20))


4. Missing data analysis...
----------------------------------------
Top 20 columns with most missing values:
                                    Column  Missing_Count  Missing_Percentage
AIAgentObsWrite            AIAgentObsWrite          48859           99.462574
SOTagsWant Entry          SOTagsWant Entry          48693           99.124646
SOTagsHaveEntry            SOTagsHaveEntry          48666           99.069682
AIModelsWantEntry        AIModelsWantEntry          48649           99.035075
AIAgentOrchWrite          AIAgentOrchWrite          48646           99.028968
JobSatPoints_15_TEXT  JobSatPoints_15_TEXT          48459           98.648291
AIAgentKnowWrite          AIAgentKnowWrite          48358           98.442685
AIModelsHaveEntry        AIModelsHaveEntry          48348           98.422328
SO_Actions_15_TEXT      SO_Actions_15_TEXT          48300           98.324614
AIAgentExtWrite            AIAgentExtWrite          48265           98.253364
CommPlatformWantEntr  CommPlatf