In [1]:
import os
import time
import requests
import pandas as pd
from zipfile import ZipFile
from io import BytesIO
import warnings
warnings.filterwarnings('ignore')

# Configuration
MAX_RETRIES = 10
RETRY_DELAY = 30  # seconds
SURVEY_YEARS = list(range(2011, 2026))  # 2011 to 2025

print("Starting Stack Overflow Survey Data Download...")
print(f"Years to download: {min(SURVEY_YEARS)} to {max(SURVEY_YEARS)}")
print(f"Retry configuration: {MAX_RETRIES} max attempts, {RETRY_DELAY}s delay")
print(f"URL pattern: https://survey.stackoverflow.co/datasets/stack-overflow-developer-survey-{{year}}.zip\n")


Starting Stack Overflow Survey Data Download...
Years to download: 2011 to 2025
Retry configuration: 10 max attempts, 30s delay
URL pattern: https://survey.stackoverflow.co/datasets/stack-overflow-developer-survey-{year}.zip



In [3]:
pd.set_option('display.max_rows', None)
pd.set_option('display.max_columns', None)
pd.set_option('display.width', None)
pd.set_option('display.max_colwidth', None)

In [3]:
def get_survey_urls(year):
    """
    Generate URL for a given survey year.
    All years use the same datasets ZIP pattern.
    """
    # All years use the same URL pattern
    url = f"https://survey.stackoverflow.co/datasets/stack-overflow-developer-survey-{year}.zip"
    return [url]

def fix_headers_for_older_years(df, year):
    """
    Fix headers for years 2011-2016 where the first two rows are headers.
    If the second row says "Response", use only the first row value.
    Otherwise, combine the first two rows.
    """
    if year >= 2011 and year <= 2015:
        if df.shape[0] < 2:
            print(f"  Warning: Not enough rows to fix headers for year {year}")
            return df
        
        # Get the first two rows
        first_row = df.iloc[0].astype(str)
        second_row = df.iloc[1].astype(str)
        
        # Create new column names
        new_columns = []
        for i, (first_val, second_val) in enumerate(zip(first_row, second_row)):
            first_val = first_val.strip()
            second_val = second_val.strip()
            
            # If second row is "Response", just use first row value
            if second_val == "Response":
                new_columns.append(first_val)
            else:
                # If both are the same or second is empty, use first
                if first_val == second_val or second_val == "":
                    new_columns.append(first_val)
                else:
                    # Combine both values - first value as primary
                    new_columns.append(f"{first_val} ({second_val})")
        
        # Set new column names
        df.columns = new_columns
        
        # Drop the first two rows (header rows)
        df = df.iloc[2:].reset_index(drop=True)
        
        print(f"  ✓ Fixed headers for year {year} (combined first two rows, removed 2 header rows)")
    
    return df

def download_file(url, year):
    """
    Download a file (single attempt, no retries).
    Returns the content if successful, None otherwise.
    Handles both CSV and ZIP files.
    """
    try:
        print(f"  Trying URL: {url}")
        response = requests.get(url, timeout=60, stream=True)
        response.raise_for_status()
        
        # Check content type
        content_type = response.headers.get('content-type', '').lower()
        
        # Reject HTML responses (likely error pages)
        if 'html' in content_type and response.status_code == 200:
            # Might be an error page, try next URL pattern
            print(f"  Warning: Received HTML instead of data file, may be wrong URL")
            return None
        
        content = response.content
        # Basic validation: check if content looks reasonable
        if len(content) < 100:
            print(f"  Warning: File too small, may be error page")
            return None
        
        # Check if it's a ZIP file by magic bytes
        is_zip = content[:2] == b'PK'  # ZIP files start with PK
        if is_zip:
            print(f"  ✓ Successfully downloaded {year} as ZIP ({len(content):,} bytes)")
        else:
            print(f"  ✓ Successfully downloaded {year} ({len(content):,} bytes)")
        
        return content
        
    except requests.exceptions.RequestException as e:
        print(f"  ✗ Download error: {str(e)}")
        return None


In [4]:
def validate_url(url):
    """
    Validate if a URL exists without downloading the full content.
    Returns True if the URL is valid and returns proper headers.
    """
    try:
        # Only get headers to check existence
        response = requests.head(url, timeout=10)
        return response.status_code == 200 and 'content-length' in response.headers
    except requests.exceptions.RequestException:
        return False

In [5]:
def download_and_extract_year(year, max_retries=MAX_RETRIES, delay=RETRY_DELAY, sample_size=None):
    """
    Download and extract survey data for a given year with retry logic.
    Tries multiple URL patterns and handles both CSV and ZIP files.
    Wraps the entire process in retry logic to catch any runtime errors.
    
    Args:
        year: The survey year to download
        max_retries: Maximum number of retry attempts
        delay: Delay between retries in seconds
        sample_size: If provided, only read this many rows from the CSV (for testing/development)
    
    Returns:
        DataFrame if successful, None otherwise.
    """
    print(f"\n{'='*60}")
    print(f"Processing year {year}")
    print(f"{'='*60}")
    
    urls = get_survey_urls(year)
    
    # Outer retry loop for entire download/extract process
    # This will retry the entire process up to max_retries times if a RuntimeError occurs
    for retry_attempt in range(max_retries):
        try:
            # Try each URL pattern
            for url in urls:
                content = download_file(url, year)
                
                if content is None:
                    continue
                
                # Check if content is a ZIP file by magic bytes (ZIP files start with 'PK')
                is_zip = content[:2] == b'PK'
                
                if is_zip:
                    # Try to parse as ZIP
                    try:
                        with ZipFile(BytesIO(content)) as zip_file:
                            # Look for CSV files in the ZIP (exclude macOS metadata)
                            csv_files = [f for f in zip_file.namelist() 
                                       if f.endswith('.csv') and not f.startswith('__MACOSX/')]
                            if csv_files:
                                # Use the first CSV file found
                                csv_file = csv_files[0]
                                print(f"  Found CSV file in ZIP: {csv_file}")
                                with zip_file.open(csv_file) as f:
                                    # For years 2011-2016, read without header to fix manually
                                    read_kwargs = {
                                        'low_memory': False, 
                                        'on_bad_lines': 'skip',
                                        'nrows': sample_size  # Add sample size parameter
                                    }
                                    if year >= 2011 and year <= 2015:
                                        read_kwargs['header'] = None
                                    
                                    try:
                                        df = pd.read_csv(f, encoding='utf-8', **read_kwargs)
                                        print(f"  ✓ Successfully loaded {year} from ZIP ({df.shape[0]:,} rows, {df.shape[1]:,} cols)")
                                        # Fix headers for older years
                                        df = fix_headers_for_older_years(df, year)
                                        return df
                                    except UnicodeDecodeError:
                                        f.seek(0)
                                        df = pd.read_csv(f, encoding='latin-1', **read_kwargs)
                                        print(f"  ✓ Successfully loaded {year} from ZIP with latin-1 encoding ({df.shape[0]:,} rows, {df.shape[1]:,} cols)")
                                        # Fix headers for older years
                                        df = fix_headers_for_older_years(df, year)
                                        return df
                            else:
                                print(f"  No CSV files found in ZIP archive")
                    except Exception as e:
                        print(f"  ZIP parsing failed: {str(e)}")
                        continue
                else:
                    # Try to parse as CSV directly
                    # For years 2011-2016, read without header to fix manually
                    read_kwargs = {
                        'low_memory': False, 
                        'on_bad_lines': 'skip',
                        'nrows': sample_size  # Add sample size parameter
                    }
                    if year >= 2011 and year <= 2016:
                        read_kwargs['header'] = None
                    
                    try:
                        df = pd.read_csv(BytesIO(content), encoding='utf-8', **read_kwargs)
                        print(f"  ✓ Successfully loaded {year} as CSV ({df.shape[0]:,} rows, {df.shape[1]:,} cols)")
                        # Fix headers for older years
                        df = fix_headers_for_older_years(df, year)
                        return df
                    except UnicodeDecodeError:
                        # Try different encoding
                        try:
                            df = pd.read_csv(BytesIO(content), encoding='latin-1', **read_kwargs)
                            print(f"  ✓ Successfully loaded {year} as CSV with latin-1 encoding ({df.shape[0]:,} rows, {df.shape[1]:,} cols)")
                            # Fix headers for older years
                            df = fix_headers_for_older_years(df, year)
                            return df
                        except Exception as e:
                            print(f"  CSV parsing failed: {str(e)}")
                            continue
                    except Exception as e:
                        print(f"  CSV parsing failed: {str(e)}")
                        continue
            
            # If we get here, the URL failed - this triggers a retry if attempts remain
            if retry_attempt < max_retries - 1:
                print(f"  ✗ Download failed for year {year}")
                print(f"  Retrying entire process (attempt {retry_attempt + 2}/{max_retries}) in {delay} seconds...")
                time.sleep(delay)
            else:
                print(f"  ✗ Failed to download and extract data for year {year} after {max_retries} attempts")
                print(f"  URL attempted: {urls[0]}")
                return None
                
        except RuntimeError as e:
            print(f"  ✗ Runtime error on attempt {retry_attempt + 1}: {str(e)}")
            if retry_attempt < max_retries - 1:
                print(f"  Retrying in {delay} seconds...")
                time.sleep(delay)
            else:
                print(f"  ✗ Failed after {max_retries} attempts due to runtime error")
                return None
        except Exception as e:
            # Catch any other unexpected errors and retry
            print(f"  ✗ Unexpected error on attempt {retry_attempt + 1}: {type(e).__name__}: {str(e)}")
            if retry_attempt < max_retries - 1:
                print(f"  Retrying in {delay} seconds...")
                time.sleep(delay)
            else:
                print(f"  ✗ Failed after {max_retries} attempts")
                return None
    
    return None

In [6]:
# Define sample size for testing (set to None for full dataset)
SAMPLE_SIZE = 1000  # Adjust this value to control how many rows to read from each year

# Download and create dataframes for each year
dataframes = {}

for year in SURVEY_YEARS:
    df = download_and_extract_year(year, max_retries=MAX_RETRIES, delay=RETRY_DELAY)
    if df is not None:
        # Add a year column to track which year the data is from
        df['SurveyYear'] = year
        dataframes[year] = df
    else:
        print(f"⚠ Skipping year {year} - download failed")

print(f"\n{'='*60}")
print(f"Download Summary")
print(f"{'='*60}")
print(f"Successfully downloaded: {len(dataframes)} out of {len(SURVEY_YEARS)} years")
print(f"Years downloaded: {sorted(dataframes.keys())}")
print(f"Years failed: {[y for y in SURVEY_YEARS if y not in dataframes]}")

# Display info for each dataframe
if dataframes:
    print(f"\n{'='*60}")
    print(f"DataFrame Information")
    print(f"{'='*60}")
    for year, df in sorted(dataframes.items()):
        print(f"Year {year}: {df.shape[0]:,} rows × {df.shape[1]:,} columns")


Processing year 2011
  Trying URL: https://survey.stackoverflow.co/datasets/stack-overflow-developer-survey-2011.zip


  ✓ Successfully downloaded 2011 as ZIP (80,173 bytes)
  Found CSV file in ZIP: 2011 Stack Overflow Survey Results.csv
  ✓ Successfully loaded 2011 from ZIP with latin-1 encoding (2,815 rows, 65 cols)
  ✓ Fixed headers for year 2011 (combined first two rows, removed 2 header rows)

Processing year 2012
  Trying URL: https://survey.stackoverflow.co/datasets/stack-overflow-developer-survey-2012.zip
  ✓ Successfully downloaded 2012 as ZIP (266,621 bytes)
  Found CSV file in ZIP: 2012 Stack Overflow Survey Results.csv
  ✓ Successfully loaded 2012 from ZIP with latin-1 encoding (6,245 rows, 75 cols)
  ✓ Fixed headers for year 2012 (combined first two rows, removed 2 header rows)

Processing year 2013
  Trying URL: https://survey.stackoverflow.co/datasets/stack-overflow-developer-survey-2013.zip
  ✓ Successfully downloaded 2013 as ZIP (689,493 bytes)
  Found CSV file in ZIP: 2013 Stack Overflow Survey Responses.csv
  ✓ Successfully loaded 2013 from ZIP (9,744 rows, 128 cols)
  ✓ Fixed header

In [7]:
# Create combined dataframe from all years
if dataframes:
    print(f"\n{'='*60}")
    print(f"Creating Combined DataFrame")
    print(f"{'='*60}")
    
    # Ensure all dataframes have unique columns before concatenation
    # Find the union of all columns
    all_columns = set()
    for df in dataframes.values():
        all_columns.update(df.columns)
    all_columns = list(all_columns)

    # Reindex each dataframe to ensure unique columns for concat
    aligned_dfs = []
    for year, df in dataframes.items():
        # Remove duplicate columns if any (can happen on bad CSVs)
        df = df.loc[:,~df.columns.duplicated()]
        aligned_df = df.reindex(columns=all_columns)
        aligned_dfs.append(aligned_df)
    
    combined_df = pd.concat(aligned_dfs, ignore_index=True, sort=False)
    
    print(f"success!")
    print(f"  rows: {combined_df.shape[0]:,}")
    print(f"  columns: {combined_df.shape[1]:,}")
    print(f"  Years: {sorted(combined_df['SurveyYear'].dropna().unique())}")
    
    # Show basic info about the combined dataframe
    print(f"\n{'='*60}")
    print(f"Combined DataFrame Info")
    print(f"{'='*60}")
    print(combined_df.info())
    print(f"{'='*60}")
else:
    print("\n⚠ No dataframes were successfully downloaded. Cannot create combined dataframe.")
    combined_df = None



Creating Combined DataFrame
success!
  rows: 772,599
  columns: 1,087
  Years: [2011, 2012, 2013, 2014, 2015, 2016, 2017, 2018, 2019, 2020, 2021, 2022, 2023, 2024, 2025]

Combined DataFrame Info
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 772599 entries, 0 to 772598
Columns: 1087 entries, ImportantHiringRep to ProgramHobby
dtypes: float64(106), int64(1), object(980)
memory usage: 6.3+ GB
None


In [8]:
# Access individual year dataframes: dataframes[year]
# Access combined dataframe: combined_df
# Example:
if dataframes:
    print(f"\n{'='*60}")
    print(f"How to Access Your Data")
    print(f"{'='*60}")
    print(f"Individual year dataframes:")
    print(f"  - dataframes[2024]  # Access 2024 data")
    print(f"  - dataframes[2023]  # Access 2023 data")
    print(f"  - etc.")
    print(f"\nCombined dataframe:")
    print(f"  - combined_df  # All years combined")
    print(f"\nAvailable years: {sorted(dataframes.keys())}")
    
    # Quick preview of the combined dataframe
    if combined_df is not None:
        print(f"\n{'='*60}")
        print(f"Combined DataFrame Preview (first 5 rows)")
        print(f"{'='*60}")
        print(combined_df.head())



How to Access Your Data
Individual year dataframes:
  - dataframes[2024]  # Access 2024 data
  - dataframes[2023]  # Access 2023 data
  - etc.

Combined dataframe:
  - combined_df  # All years combined

Available years: [2011, 2012, 2013, 2014, 2015, 2016, 2017, 2018, 2019, 2020, 2021, 2022, 2023, 2024, 2025]

Combined DataFrame Preview (first 5 rows)
  ImportantHiringRep EquipmentSatisfiedRAM EduOther  HoursPerWeek LearnCodeAI  \
0                NaN                   NaN      NaN           NaN         NaN   
1                NaN                   NaN      NaN           NaN         NaN   
2                NaN                   NaN      NaN           NaN         NaN   
3                NaN                   NaN      NaN           NaN         NaN   
4                NaN                   NaN      NaN           NaN         NaN   

   JobSatPoints_16 company_size_range WebframeWantEntry AssessJobCompensation  \
0              NaN                NaN               NaN                   NaN

In [9]:
combined_df.head()

Unnamed: 0.1,ImportantHiringRep,EquipmentSatisfiedRAM,EduOther,HoursPerWeek,LearnCodeAI,JobSatPoints_16,company_size_range,WebframeWantEntry,AssessJobCompensation,TechOppose_2,nan (Training & Education: Other),why_stack_overflow,LanguageDesireNextYear,EthicalImplications,CompTotal,Knowledge_6,nan (Training & Education: Masters in CS),nan (Fixing bugs),nan (Future Lang & Tech: R),YearsCodePro,nan (Desktop Operating System),nan (Why try Stack Overflow Careers: Selection of revelant jobs),AIToolDon't plan to use AI for this task,PurchaseHow,dogs_vs_cats,nan (TypeScript),AgreeDisagree2,EquipmentSatisfiedMonitors,nan (Servers),nan (Perception of contact form: Twitter),nan (Current Lang & Tech: Cordova),Please rate the advertising you've seen on Stack Overflow (The ads are relevant),"nan (Link to a Stack Overflow Careers Company Page or other source of more information about the company (videos, articles, etc))",FizzBuzz,InfluenceVizTools,StackOverflowModeration,nan (How frequently land on or read Stack Overflow),nan (Current Lang & Tech: SQL Server),HaveWorkedPlatform,employment_status,interview_likelihood,EquipmentSatisfiedRW,SOVisit1st,nan (Future Lang & Tech: Redis),job_discovery,nan (Commuting),nan (Limited night / weekend work),What types of purchases are you involved in? (Hardware),nan (Future Lang & Tech: Scala),DatabaseDesireNextYear,CurrencyDesc,nan (Current Lang & Tech: Redis),nan (Appealing message traits: Stack Overflow Company Page),DatabaseWantToWorkWith,WorkChallenge,ICorPM,How likely is it that a recommendation you make will be acted upon?,nan (Most urgent info about job opportunity: Job title),WebframeAdmired,AssessJobDiversity,EmploymentStatus,Knowledge_8,Methodology,nan (Node.js),AdsPriorities2,nan (Recommender),nan (Other tablet),SOComm,nan,AIAgentChallengesNeutral,AISelect,Which best describes the size of your company?,How old are you?,ProblemSolving,Frustration,nan (Current Lang & Tech: Hadoop),PlatformWorkedWith,CurrencySymbol,AdsAgreeDisagree2,nan (Consultants),Accessibility,"You answered you don't have a Careers profile, can you elaborate why?",nan (Current Lang & Tech: Cassandra),WantWorkDatabase,Which of our sites do you frequent most?,nan (CSS),nan (Future Lang & Tech: Java),StackOverflowDevices,QuestionsConfusing,How many people work for your company?,nan (Most important aspect of new job opportunity: Company size),BlockchainOrg,salary_midpoint,PurchaseInfluence,How often do you find solutions to your programming problems on Stack Overflow without asking a new question?,big_mac_index,remote,JobEmailPriorities1,TechEndorse,TechEndorse_1,LanguageChoice,HighestEducationParents,nan (Blackberry),TabsSpaces,CompanySize,nan (Most important aspect of new job opportunity: Tech stack),FormalEducation,StackOverflowRecommend,SkipMeals,Employment,nan (Future Lang & Tech: C++),DeveloperType,How do you prefer to be contacted about job opportunities? (Email),Ethnicity,TimeSearching,country,AssessJobIndustry,DifficultCommunication,LanguageWorkedWith,AISent,nan (Check Writer),PlatformWantToWorkWith,CousinEducation,SOTimeSaved,CommPlatformHaveEntr,agree_adblocker,"nan ($10,001 - $25,000)",SOJobs,StackOverflowAdsRelevant,nan (Most urgent info about job opportunity: Company name),HypotheticalTools1,InTheZone,SurveyEasy,WebframeHaveEntry,nan (Future Lang & Tech: Cassandra),Knowledge_5,nan (Appealing message traits: Salary information),WebFrameDesireNextYear,nan (Most important aspect of new job opportunity: Health insurance),JobSatPoints_1,"nan (>$150,000)",Do you have a Stack Overflow Careers Profile?,nan (I like that I can indicate ads I want to see less of),Exercise,How often do you visit job boards?,nan (Training & Education: Online Class),JobEmailPriorities7,AssessJob5,AnnoyingUI,DatabaseWantEntry,nan (Future Lang & Tech: iOS),nan (Current Lang & Tech: C++),nan (Purchasing Power),nan (Prefered Source Control: write-in),TBranch,AIDevHaveWorkedWith,AIBen,AssessJob8,nan (Most urgent info about job opportunity: Benefits),nan (Current Lang & Tech: C#),nan (Technical support),nan (How can companies improve interview process: Gimme coffee),"nan ($100,001 - $150,000)",nan (Identification With the Company/Goals),AssessJob4,WebframeWorkedWith,nan (Future Lang & Tech: PHP),InfluenceConsultants,nan (Source control used: SVN),Knowledge_4,JobEmailPriorities2,nan (Current Lang & Tech: Ruby),WakeTime,education,nan (Future Lang & Tech: Objective-C),AIExplain,nan (Future Lang & Tech: Python),nan (Autonomy Over Budget/Expenditures),nan (Current Lang & Tech: Python),"What is your budget for outside expenditures (hardware, software, consulting, etc) for 2014?",AIAgentOrchWrite,AIHuman,StackOverflowCompanyPage,OfficeStackSyncWantToWorkWith,UpdateCV,important_wfh,NumberMonitors,ResumeUpdate,Which US State or Territory do you live in?,nan (Who do you want to communicate with about a new job opportunity: In-house tech recruiter),nan (Xbox),StackOverflowAdsDistracting,"nan (I influence purchasing decisions, but don't have final approval)",LanguageAdmired,ImportantHiringPMExp,AssessBenefits4,Hobbyist,YearsCodingProf,nan (How can companies improve interview process: Flexible interview schedule),EmbeddedHaveWorkedWith,TechOppose_7,AIAgentChallengesSomewhat agree,AdsPriorities1,Which of the following best describes your occupation?,MainBranch,JobSatPoints_13,Respondent,nan (Most annoying about job search: Finding interesting job),important_ownoffice,InfluenceCloud,AIModelsWantEntry,team_size_range,nan (I can buy anything I want without asking anyone),AIDangerous,EmbeddedAdmired,nan (CoffeeScript),AssessJobTech,HopeFiveYears,nan (Xbox 360),nan (Includes salary information),AIToolInterested in Using,AINextLess integrated,nan (Appealing message traits: Team described),DevEnvsHaveWorkedWith,JobSatPoints_10,nan (Python),nan (Future Lang & Tech: Write-In),nan (How often contacted by recruiters),AssessJobProduct,occupation_group,important_promotion,TechOppose_16,so_region,DatabaseChoice,nan (Open to new job opportunities),ExCoderWillNotCode,WelcomeChange,new_job_value,How would you best describe the industry you currently work in?,CompetePeers,"nan (User Equipment: Monitors, PCs, Laptops)",nan (Future Lang & Tech: Dart),nan (Future Lang & Tech: Haskell),AIEthics,nan (Human Resources),TechOppose_13,nan (Grants / outside fund-raising),NEWOvertime,What other departments / roles do you interact with regularly? (System Administrators),nan (Describes benefits / perks of the work environment),nan (Wii),nan (Most important aspect of new job opportunity: Office location),ToolsTechHaveWorkedWith,US_State,SO_Actions_10,StackOverflowDescribes,nan (Future Lang & Tech: C++11),AIModelsHaveEntry,ExCoderBelonged,nan (Most important aspect of new job opportunity: Remote working),AssessJob2,nan (Future Lang & Tech: Hadoop),important_control,agree_notice,AssessJob6,Select up to 3 (Most annoying about job search: Finding time),ToolsTechAdmired,JobEmailPriorities4,DevEnvsWantToWorkWith,ImportantHiringTechExp,CollaborateRemote,ImportantHiringGettingThingsDone,nan (Xbox One),nan (Current Lang & Tech: PHP),nan (jQuery),agree_nightcode,MetricAssess,DiversityImportant,VCHostingProfessional use,VersionControlSystem,nan (Other),nan (Learning new skills),nan (I have a discretionary budget at my disposal),NEWStuck,nan (Android Tablet),nan (Current Lang & Tech: AngularJS),nan (Why answer: Help future programmers),nan (Testers / Quality Assurance),nan (Other media streaming device),nan (Employment Status),nan (Technical Support),How large is the team that you work on?,ImportantHiringOpenSource,SOHowMuchTime,ExCoderSkills,What is your current Stack Overflow reputation?,Frequency_3,SOTagsHaveEntry,nan (Describes company culture),nan (Why use Stack Overflow: Receive help on personal projects),PurchaseWhat,Which technologies are you excited about? (Node.js),nan (What ads? I use an ad blocker),What type of project are you developing?,nan (Convenient Commute or Telecommute Options),AssessJob10,WebframeHaveWorkedWith,nan (Software),SOFindAnswer,nan (How important is remote when evaluating new job opportunity?),YearsCodedJob,InfluenceTechStack,TechEndorse_3,nan (PhoneGap),NEWCollabToolsHaveWorkedWith,nan (HDTV),RemoteWork,OfficeStackSyncHaveWorkedWith,LanguagesHaveEntry,TechEndorse_7,AIAgents,StackOverflowParticipate,CompFreq,VCHostingPersonal use,nan (Looking for a job),open_to_new_job,nan (Who do you want to communicate with about a new job opportunity: Headhunter),WorkStart,Race,Trans,WorkExp,NEWOffTopic,nan (Go),StackOverflowCommunity,SurveyLength,nan (Current Lang & Tech: F#),desktop_os,JobProfile,developer_challenges,nan (Haskell),nan (Mentions my code or Stack Overflow activity),why_learn_new_tech,AISearchDevAdmired,AIAgentChange,age_range,BuildingThings,nan (Who do you want to communicate with about a new job opportunity: Manager),AssessJobOffice,JobEmailPriorities3,AIAgentImpactStrongly disagree,AIToolPlan to partially use AI,ResponseId,nan (Opportunity to Use/Learn New Technologies),What operating system do you use the most?,nan (Current Lang & Tech: Dart),What advertisers do you remember seeing on Stack Overflow? (Open-Ended Response),SocialMedia,PlatformHaveEntry,SurveyYear,JobSatPoints_8,ImportantHiringAlgorithms,nan (Redis),EthicsResponsible,OfficeStackAsyncHaveWorkedWith,nan (other (please specify)),OfficeStackAsyncAdmired,OfficeStackWantEntry,nan (Future Lang & Tech: C#),EquipmentSatisfiedStorage,BuyNewTool,SO_Actions_4,Blockchain,agree_tech,nan (Regular Mobile Phone),nan (Gender),UnitTests,BetterLife,nan (Most important aspect of new job opportunity: Flexible work options),HoursOutside,WorkWeekHrs,nan (F#),nan (Future Lang & Tech: SQL Server),AdsPriorities6,NEWJobHunt,CommPlatformWantToWorkWith,SexualOrientation,Knowledge_2,nan (Current Lang & Tech: Spark),JobSecurity,nan (Current Lang & Tech: JavaScript),nan (Current Lang & Tech: Visual Basic),InfluenceServers,nan (PS3),StackOverflowAnswer,SOPartFreq,job_satisfaction,AssessBenefits7,DevType,nan (Cordova),nan (How can companies improve interview process: Remote interviews),nan (Perception of contact form: Facebook),commit_frequency,NEWCollabToolsWorkedWith,nan (Blu-Ray),Age,NEWDevOpsImpt,JobSeek,CommPlatformHaveWorkedWith,Select up to 3 (How can companies improve interview process: More live code),LearnCodeOnline,nan (Most important aspect of new job opportunity: Equity),AdsActions,MiscTechAdmired,ImportantHiringCompanies,SO_Actions_16,nan (Source control used: I don't use source control),CommPlatformWantEntr,UnderstandComputers,Which desktop operating system do you use the most?,nan (The ads are Informative),AssessJobRemote,AIFrustration,AIModelsChoice,SOTagsWantToWorkWith,salary_range,nan (Future Lang & Tech: Sharepoint),ChallengeMyself,Frequency_2,nan (Why answer: I don't answer and I don't want to),IDE,AssessBenefits5,nan (Dart),nan (Perception of contact form: Email),nan (Why try Stack Overflow Careers: Jobs are on Stack Overflow),nan (Other netbook),"In an average week, how do you spend your time at work? (Developing new features)",ClickyKeys,StackOverflowDevStory,AssessJobExp,TechList,nan (Finance),nan (Current Lang & Tech: LAMP),AINextVery similar,SO_Actions_5,Knowledge_7,nan (Appealing message traits: Company culture described),StackOverflowSatisfaction,What best describes your career / job satisfaction?,nan (Future Lang & Tech: MongoDB),StackOverflowBetter,nan (Surfing the Internet),StackOverflowVisit,nan (Future Lang & Tech: CoffeeScript),AssessBenefits8,JobSat,InfluenceDatabase,nan (Training & Education: Boot camp or night school),SurveyEase,How often are you contacted by recruiters?,nan (Hadoop),AdsPriorities4,CodeRev,nan (Current Lang & Tech: Clojure),AssessBenefits3,NEWOtherComms,nan (Nook),EmbeddedWantToWorkWith,CommPlatformAdmired,AIModelsAdmired,ExCoderNotForMe,CareerSatisfaction,nan (WinRT),DevEnvsChoice,SONewContent,nan (Age),AIThreat,PlatformAdmired,nan (Boxee),AIModelsWantToWorkWith,BoringDetails,Sexuality,AdsPriorities3,nan (Future Lang & Tech: LAMP),What Country do you live in?,nan (How often are Stack Overflow's answers helpful),nan (Future Lang & Tech: Salesforce),HaveWorkedDatabase,How did you find out about your current job?,nan (Job Satisfaction),MiscTechHaveWorkedWith,nan (Source control used: TFS),SalaryType,nan (Future Lang & Tech: F#),important_buildexisting,nan (I click on ads that interest me),TrueFalse_3,AdBlockerDisable,JobSatisfaction,nan (Most annoying about job search: Finding job I'm qualified for),nan (Most urgent info about job opportunity: Tech stack),nan (AngularJS),nan (Room for Growth of Skills/Knowledge),nan (C),nan (Future Lang & Tech: Node.js),nan (Source control used: Perforce),Are you currently looking for a job or open to new opportunities?,"Including bonus, what is your annual compensation in USD?",AssessJobLeaders,JobEmailPriorities5,nan (Appealing message traits: Code or projects mentioned),nan (Kindle Fire),Select all that apply (Training & Education: No formal training),nan (Future Lang & Tech: Wordpress),nan (Current Lang & Tech: Haskell),YearsCode,AIAcc,HaveWorkedFramework,self_identification,PlatformChoice,SOHow,important_sameend,WebframeChoice,SO_Actions_3,NEWCollabToolsDesireNextYear,nan (Kindle),DatabaseHaveWorkedWith,MiscTechWorkedWith,TimeAnswering,LanguagesWantEntry,StackOverflowJobListing,occupation,WebDeveloperType,JobContactPriorities3,AINextSomewhat similar,nan (Future Lang & Tech: Cloud),SOTagsHaveWorkedWith,nan (Customers),nan (Training & Education: On the job),AssessJobProjects,AISearchDevWantToWorkWith,TechOppose_5,nan (Training & Education: PhD in CS),women_on_team,nan (Training & Education: Mentorship),Currency,AuditoryEnvironment,nan (Current Lang & Tech: Perl),AIAgentChallengesStrongly agree,EthicsChoice,Onboarding,AIChallenges,rep_range,LearnedHiring,StackOverflowJobs,nan (How can companies improve interview process: Introduce me to boss),AssessJobCommute,SOTagsAdmired,nan (Ruby),nan (Lots of Control Over Your Own Work),nan (Current Lang & Tech: R),nan (Why use Stack Overflow: Communicate with others),WantWorkPlatform,OrgSize,SOVisitTo,AssessBenefits6,InfluenceInternet,nan (Country),nan (Why try Stack Overflow Careers: Jobs site for programmers),VCInteraction,Knowledge_1,nan (Source control used: DCVS),nan (Current Lang & Tech: iOS),unit_testing,Professional,EthicsReport,TechEndorse_13_TEXT,AINextNo change,AIDevWantToWorkWith,Select all that apply (Why try Stack Overflow Careers: No spam),JobSatPoints_14,AdsAgreeDisagree1,nan (Don't know),AssessJob1,CommunicationTools,nan (Future Lang & Tech: Go),nan (High Caliber Team (is everyone else smart/hardworking)),HypotheticalTools4,NEWEdImpt,NEWDevOps,Frequency_1,ExCoderBalance,nan (Prefered IDE theme),HypotheticalTools5,nan (Compensation: midpoint),EdLevel,InfluenceRecruitment,nan (Most important aspect of new job opportunity: Building something that matters),AdsPriorities5,"nan ($25,001 - $40,000)","nan ($75,001 - $100,000)",nan (Most important aspect of new job opportunity: Advancement),StackOverflowJobsRecommend,nan (How many hours programming as hobby per week?),nan (Meetings),ImportantHiringTitles,nan (Why answer: Self promotion),Knowledge_3,"If your company has a native mobile app, what platforms do you support? (iPhone)",FrameworkDesireNextYear,nan (How can companies improve interview process: Better preparation),OfficeStackSyncAdmired,AIToolCurrently mostly AI,StackOverflowFoundAnswer,AssessBenefits1,nan (No mobile app),OperatingSystem,AdBlockerReasons,MilitaryUS,WebframeDesireNextYear,SO_Dev_Content,Were you aware of the Apptivate contest?,Which languages are you proficient in? (Java),OtherPeoplesCode,AIComplex,OpSys,nan (Ask questions to solve problems),LanguageWantToWorkWith,nan (Arduino / Raspberry Pi),Did you participate in the Apptivate contest?,nan (JavaScript),nan (Mobile app sales),Where do you work remotely most of the time?,Salary,nan (Product Managers),nan (Most important aspect of new job opportunity: Job title),TechDoc,nan (Current Lang & Tech: CoffeeScript),"In the last 12 months, how much money have you spent on personal technology-related purchases?",job_search_annoyance,AISearchHaveWorkedWith,DatabaseHaveEntry,ScreenName,Extraversion,SOTagsWant Entry,TechOppose_3,AIAgentImpactNeutral,AssessBenefits2,nan (C++),AISearchWantToWorkWith,NEWOnboardGood,nan (Perception of contact form: Stack Overflow Careers),AISearchDevHaveWorkedWith,agree_legacy,AIAgentExtWrite,YearsCoding,AINextVery different,ConvertedComp,MgrMoney,YearsProgram,NEWSOSites,Q120,Select up to 3 (Most important aspect of new job opportunity: Salary),TechEndorseIntro,nan (Current Lang & Tech: C++11),Have you changed jobs in the last 12 months?,SelfTaughtTypes,nan (Other (please specify)),nan (Training & Education: BS in CS),JobSatPoints_15,JobSatPoints_4,PlatformWantEntry,agree_mars,TimeAfterBootcamp,InfluenceHardware,AIFuture,AIToolNot interested in Using,AILearnHow,nan (Java),JobSatPoints_9,nan (Prefered Source Control),Do you work remotely?,nan (Purchaser),AssessJob9,dev_environment,AgentUsesGeneral,SO_Actions_7,HomeRemote,agree_problemsolving,University,AssessBenefits10,PlatformDesireNextYear,OpenSourcer,nan (Why use Stack Overflow: I don't use Stack Overflow),JobSeekingStatus,Do you enjoy working remotely?,nan (Current Lang & Tech: MongoDB),nan (Future Lang & Tech: JavaScript),nan (Office in a Desirable City/Area),nan (Looking for a new job),nan (Future Lang & Tech: Matlab),AssessJobProfDevel,WorkRemote,nan (The ads are entertaining),CodeRevHrs,WebFrameWorkedWith,nan (Netbook),InterestedAnswers,nan (Excitement About the Company's Products),LanguageHaveWorkedWith,"Please rate how important each of the following characteristics of a company/job offer are to you. Please select a MAXIMUM of 3 items as ""Non-Negotiables"" to help us identify the most important items, those where you would never consider a company if they didn't meet them. (High Base Compensation)",SOAccount,ExCoderReturn,AdBlocker,nan (Perception of contact form: LinkedIn),What is your gender?,nan (HTML5),nan (How can companies improve interview process: Fewer brainteasers),NonDeveloperType,nan (SQL),CompanyType,RaceEthnicity,AIAgentChallengesStrongly disagree,nan (Phone),AIAgent_Uses,nan (Future Lang & Tech: Windows Phone),nan (Refactoring / code quality),industry,NEWPurpleLink,nan (Most urgent info about job opportunity: Product details),"nan ($41,000 - $75,000)",AIAgentExternal,gender,nan (Who do you want to communicate with about a new job opportunity: Developer),nan (Objective-C),"Including yourself, how many developers are employed at your company?",JobContactPriorities1,LearnCodeChoose,aliens,nan (Most important aspect of new job opportunity: Industry),TechEndorse_8,nan (Current Lang & Tech: Node.js),important_newtech,"In an average week, how do you spend your time? (Developing new features)",YearsCodedJobPast,SO_Actions_15,nan (Future Lang & Tech: Spark),CheckInCode,nan (PHP),AIAgentObsWrite,SODuration,LearningNewTech,AINextMore integrated,Select all that apply (Most urgent info about job opportunity: Salary),agree_diversity,AIAgentKnowledge,tech_want,nan (Years IT / Programming Experience),"What is your budget for outside expenditures (hardware, software, consulting, etc) for 2011? (<$10,000)",nan (Future Lang & Tech: Perl),DevEnviron,Have you visited / Are you aware of Stack Overflow Careers?,tech_do,Select all that apply (Current Lang & Tech: Android),OpSysPersonal use,nan (Why use Stack Overflow: Can't do job without it),AIAgentObserveSecure,nan (Why use Stack Overflow: To give help),AssessBenefits11,ExCoder10Years,MiscTechDesireNextYear,Have you visited / Are you aware of Stack Overflow Careers 2.0?,nan (Why try Stack Overflow Careers: Other),nan (Consulting),agree_alcohol,Hobby,HypotheticalTools3,nan (Most important aspect of new job opportunity: Work - Life balance),Which of the following languages or technologies have you used significantly in the past year? (C),nan (Why use Stack Overflow: Demonstrate expertise),nan (C++11),ShipIt,CareerSat,NEWJobHuntResearch,What is your involvement in purchasing? You can choose more than 1. (Influencer),nan (Designers),nan (Perception of contact form: Xing),un_subregion,ProfessionalTech,agree_loveboss,nan (JQuery),JobSatPoints_11,NEWCollabToolsAdmired,nan (Why try Stack Overflow Careers: Showcase Stack Overflow activity),nan (Future Lang & Tech: Arduino),nan (Current Lang & Tech: Matlab),AssessJobRole,What Country or Region do you live in?,nan (Describes the team I will work on),JobEmailPriorities6,MiscTechWantToWorkWith,AgreeDisagree1,AIAgentImpactStrongly agree,"What is your budget for outside expenditures (hardware, software, consulting, etc) for 2013?",nan (Current Lang & Tech: C),nan (Why use Stack Overflow: Maintain online presence),How would you best describe the industry you work in?,how_to_improve_interview_process,Dependents,ConvertedCompYearly,WantWorkFramework,nan (Preferred text editor),nan (Most important aspect of new job opportunity: Company culture),nan (Other gaming system),Select all that apply (Why use Stack Overflow: Help for job),nan (Stack Overflow Careers Message),AINextNeither different nor similar,nan (Most urgent info about job opportunity: Office location),nan (Current Lang & Tech: Sharepoint),nan (Compensation),EducationImportant,SOAI,TechOppose_1,AIAgentImpactSomewhat agree,nan (Current Lang & Tech: Rust),JobSatPoints_5,nan (Android phone),nan (Direct sales to consumers),nan (Tabs or Spaces),OfficeStackAsyncWantToWorkWith,nan (How can companies improve interview process: Show me workplace),nan (Current Lang & Tech: Swift),important_buildnew,ITperson,TechOppose_9,AIModelsHaveWorkedWith,TimeFullyProductive,WebframeWantToWorkWith,hobby,nan (Other Smart Phone),AIAgentOrchestration,AIResponsible,"If you make a software product, how does your company make money? (You can choose more than one) (Advertising)",nan (Why answer: Demonstrate expertise),nan (iPad),nan (Preferred text editor: write-in),nan (Most important aspect of new job opportunity: Company stage),StackOverflowMakeMoney,AdsPriorities7,nan (Future Lang & Tech: Visual Basic),ImpSyn,star_wars_vs_star_trek,AssessJobDept,LastInt,nan (Current Lang & Tech: Cloud),TechOppose_15,UndergradMajor,JobSatPoints_6,nan (Build my online reputation),nan (Source control used: write-in),ChangeWorld,nan (How can companies improve interview process: Introduce me to team),TrueFalse_2,NEWCollabToolsWantToWorkWith,AssessBenefits9,Check,nan (Who do you want to communicate with about a new job opportunity: In-house recruiter),ExCoderActive,TrueFalse_1,FriendsDevelopers,nan (Future Lang & Tech: C),DevEnvHaveEntry,LastHireDate,nan (Current Lang & Tech: Go),nan (Why answer: No idea),nan (Current Lang & Tech: Windows Phone),nan (Current Lang & Tech: Wordpress),Country,"nan (Training & Education: Some college, but no CS degree)",TechEndorse_13,SO_Actions_9,AIInteresting,SurveyTooLong,ExpectedSalary,nan (Using Stack Exchange),AIToolPlan to mostly use AI,NEWPurchaseResearch,nan (Android tablet),SeriousWork,Do you have a Stack Overflow Careers 2.0 Profile?,JobSearchStatus,ProfessionalQuestion,PronounceGIF,MentalHealth,Knowledge_9,nan (Direct sales to companies),MgrWant,nan (Current Lang & Tech: Write-In),LearnCode,NewRole,EmploymentAddl,AssessJob3,AINextMuch more integrated,DevEnvsAdmired,Overpaid,nan (40 hour work week),FrameworkWorkedWith,What is your involvement in purchasing products or services for the company you work for? (You can choose more than one) (I can recommend or request products),nan (Changed Jobs in last 12 Months),WantWorkLanguage,RightWrongWay,nan (Why use Stack Overflow: Love to learn),nan (C#),UK_Country,nan (Future Lang & Tech: Rust),nan (Industry),experience_range,nan (Future Lang & Tech: SQL),Select up to 3 (Appealing message traits: Message is personalized),AgreeDisagree3,nan (Future Lang & Tech: Ruby),JobContactPriorities4,TechEndorse_5,nan (LinkedIn Inmail),CodingActivities,How do you use Stack Overflow? (Read other people's questions to solve my problems),nan (Future Lang & Tech: AngularJS),nan (Most important aspect of new job opportunity: Quality of colleagues),programming_ability,VersionControl,ToolsTechWantToWorkWith,"What is your budget for outside expenditures (hardware, software, consulting, etc) for 2011?",HackathonReasons,nan (Most annoying about job search: Writing and updating CV),ToolCountWork,nan (Android),AIToolCurrently Using,AIOpen,QuestionsInteresting,SOFriction,nan (PS4),TechOppose_11,SO_Actions_1,EquipmentSatisfiedCPU,experience_midpoint,DatabaseAdmired,nan (Twitter),Unnamed: 0,nan (Windows Tablet),AIAgentImpactSomewhat disagree,nan (Why answer: Sense of responsibility to developers),InvestTimeTools,MobileDeveloperType,WorkPayCare,TechEndorse_2,"nan (High Quality Office Space (amenities, lounge space, free food, etc))",nan (Current Lang & Tech: Objective-C),InfluenceWorkstation,InfluenceCommunication,nan (Sales / Marketing),AINextSomewhat different,nan (Perception of contact form: Phone),JobSatPoints_15_TEXT,SO_Actions_6,nan (Appealing message traits: Benefits & Perks),ProjectManagement,nan (Most annoying about job search: Interesting companies rarely respond),nan (Desktop Operating System: write-in),Gender,EducationParents,JobSatPoints_7,age_midpoint,ToolCountPersonal,collector,nan (Most annoying about job search: Taking time off work to interview),StackOverflowMetaChat,SurveyLong,ConvertedSalary,nan (Current Lang & Tech: Arduino),nan (Most urgent info about job opportunity: Colleagues),Age1stCode,StackOverflowCopiedCode,NEWLearn,nan (Current Lang & Tech: Java),important_companymission,How many developers are employed at your company?,StackOverflowConsiderMember,OffOn,EntTeams,nan (Future Lang & Tech: Cordova),nan (Source control used: Bitkeeper),nan (Training & Education: Industry certification),PlatformHaveWorkedWith,nan (Occupation),nan (Approver),nan (Current Lang & Tech: SQL),nan (Most important aspect of new job opportunity: Company reputation),nan (Stock Options/Profit Sharing Program),Select all that apply (Source control used: Git),Which technology products do you own? (You can choose more than one) (iPhone),nan (Current Lang & Tech: Scala),AssessJob7,JobContactPriorities2,OpenSource,BlockchainIs,WorkPlan,nan (Perception of recruiter contact),EducationTypes,AIAgentChallengesSomewhat disagree,nan (Answer questions I know the answer to),StackOverflowWhatDo,ErgonomicDevices,AdsAgreeDisagree3,LearnCodeCoursesCert,Please rate your job/career satisfaction,HypotheticalTools2,StackOverflowHasAccount,AIAgentKnowWrite,nan (Remote Status),nan (Source control used: CVS),nan (Current Lang & Tech: Salesforce),ImportantHiringEducation,Industry,visit_frequency,Select all that apply (Future Lang & Tech: Android),HaveWorkedLanguage,nan (Most annoying about job search: The Interview),JobFactors,"In receiving an email about a job opportunity, what attributes of the message would make you more likely to respond? (Message is personalized to me)",nan (I've taken a trial/purchased a product from ads),nan (How many caffeinated beverages per day?),nan (AppleTV),nan (Windows Phone),WorkLoc,Containers,AINextMuch less integrated,DevEnvWantEntry,nan (Future Lang & Tech: Swift),LastNewJob,JobContactPriorities5,StackOverflowJobSearch,nan (Wii U),nan (Future Lang & Tech: Clojure),ProfessionalCloud,AssessJobFinances,MgrIdiot,BuildvsBuy,OpSysProfessional use,StackOverflowHelpful,How many years of IT/Programming experience do you have?,ResumePrompted,DatabaseWorkedWith,important_variety,MajorUndergrad,What is your involvement in purchasing products or services for the company you work for? (You can choose more than one) (Influencer),AIToolCurrently partially AI,nan (Other Stack Exchange (please specify)),OfficeStackHaveEntry,"nan (Positive Organization Structure (not much bureaucracy, helpful management))",nan (No Involvement),"nan (Quality of Workstation (dream machine, 30inch monitors, etc))",KinshipDevelopers,EnjoyDebugging,SO_Actions_15_TEXT,nan (MongoDB),nan (Perl),StackOverflowNewQuestion,nan (Android Phone),HoursComputer,Select all that apply (Why answer: Help a programmer in need),InfluenceDeptTech,SOVisitFreq,nan (I'm a Seller),nan (Appealing message traits: Stack Overflow activity mentioned),TechEndorse_9,ImportantBenefits,nan (Recruitment Tools & Services),TechEndorse_4,nan (Most important aspect of new job opportunity: Important decisions),ImportantHiringCommunication,Student,nan (Source control used: Mercurial),TechOppose_15_TEXT,TechEndorse_6,nan (Why answer: I don't answer but I want to),nan (Source control used: Legacy / Custom),nan (Software as a service / recurring billing),ProgramHobby
0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,Not in a million years,,,,,,,,,,,,,,,Start Up (1-25),< 20,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,Web Application Developer,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,Mobile,,,,Software,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,Linux,,,,,2011,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,Student / Unemployed,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,"$25,001 - $40,000",,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,JavaScript,,,,,,,,<$100,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,Africa,,,,,,,,,Consulting,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,Approver,,,,,iPhone,,,,,,,,,,,,,,,FML,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,<2,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
1,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,It's been known to happen,,,,,,,,,,,,,,,Mature Small Business (25-100),25-29,,,,,,,,,,,,Stack Overflow,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,Server Programmer,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,Enterprise,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,Windows 7,,,,,2011,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,Other netbook,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,C,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,Java,,,,,,,,,,,,,,,,$251-$500,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,SQL,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,Other Europe,,,,,,,,,Software Products,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,iPhone,,,,,,,,,,,,,,,So happy it hurts,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,41310,,,,,,,,,,No Involvement,,,,,,,,,,,,,,,,,,,,,,,,,,,,
2,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,Unless it's stoopid it gets done,,,,,,,,,,,,,,,Mid Sized (100-999),25-29,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,Server Programmer,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,SaaS,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,Linux,,,,,2011,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,Java,,,,,,,,JavaScript,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,SQL,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,India,,,,,,,,,Software Products,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,41435,,,,,,,,,,,,,,,,,,,,,,,I'm a Seller,,,,,,,,,,,,,,,
3,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,It's been known to happen,,,,,,,,,,,,,,,Student,< 20,,,,,,,,,,,,Stack Overflow,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,Student,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,"User Equipment: Monitors, PCs, Laptops",,,,,,,,,,Wii,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,Other,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,Linux,,,,,2011,,,,,,Haskell,,,,,,,,,Regular Mobile Phone,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,Student / Unemployed,,,,,,,,,,,,,,,,,,Kindle,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,Java,,,,,,,,,,,,,,,,"$501-$1,000",,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,"<$10,000",,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,Germany,,,,,,,,,Foundation / Non-Profit,,,,,,,Other gaming system,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,I enjoy going to work,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,41310,,,,,,,,,,No Involvement,,,,,,,,,,,,,,,,,,,,,,,,,,,,
4,,,,,,,,,,,,,,,,,,,,,,,,,,,,,Servers,,,,,,,,,,,,,,,,,,,Hardware,,,,,,,,,I run this place,,,,,,,,,,,,,,,Start Up (1-25),35-39,,,,,,,,,,,,Stack Overflow,CSS,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,"$10,001 - $25,000",,,,,,,,,,,,,,">$150,000",,,,,,,,,,,,,,,,,,,,,,"$100,001 - $150,000",,,,,,,,,,,,,,,,,,,,,,,,,,,,Xbox,,,,,,,,,,,,,"Executive (VP of Eng, CTO, CIO, etc.)",,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,"User Equipment: Monitors, PCs, Laptops",,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,Enterprise,,,,Software,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,Linux,,,,,2011,,,,,,,,,,,,,,,Regular Mobile Phone,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,C,,,,"$80,000 - $100,000",,,,,,,,,,,,,,,,,,Kindle,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,Don't know,,,,,,,,,,,,,,,,,"$25,001 - $40,000","$75,001 - $100,000",,,,,,,,,,,,,,,,,,,,,,Java,,,,,,,,JavaScript,,,,,,,,$251-$500,,,,,,,,,,C++,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,Purchaser,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,SQL,,,,,,,,,,,"$41,000 - $75,000",,,,,,,,,,,,,,,,,,PHP,,,,,,,,,,"<$10,000",,,,,,,,,,,,,,,,,,,,,,,,,,Influencer,,,,,,,,,,,,,Other Asia,,,,,,,,,Software Products,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,Android,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,Approver,,,,,,,,,,,,,,,,,,,,It pays the bills,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,11,,,,,,,,,,,,,,,,Perl,,,,,,,,,,,,,,,,,,,,,,


In [10]:
# Create a stratified sample of 20% of the data
if combined_df is not None:
    # Calculate 20% sample size for each year
    sample_size = 0.2
    
    # Perform stratified sampling
    stratified_sample = combined_df.groupby('SurveyYear', group_keys=False).apply(
        lambda x: x.sample(frac=sample_size, random_state=42)
    ).reset_index(drop=True)
    
    # Export to CSV
    output_file = 'stackoverflow_survey_stratified_sample.csv'
    stratified_sample.to_csv(output_file, index=False)
    
    print(f"Original dataset size: {len(combined_df):,} rows")
    print(f"Sampled dataset size: {len(stratified_sample):,} rows")
    print(f"\nSample size by year:")
    print(stratified_sample['SurveyYear'].value_counts().sort_index())
    print(f"\nData exported to: {output_file}")

Original dataset size: 772,599 rows
Sampled dataset size: 154,521 rows

Sample size by year:
SurveyYear
2011      563
2012     1249
2013     1948
2014     1529
2015     5217
2016    11206
2017    10278
2018    19771
2019    17777
2020    12892
2021    16688
2022    14654
2023    17837
2024    13087
2025     9825
Name: count, dtype: int64

Data exported to: stackoverflow_survey_stratified_sample.csv


In [11]:
stratified_sample.to_csv('stackoverflow_survey_stratified_sample.csv')

## Adjust df_use based on the dataframe you wwant to use (sample_df, stratified_sample, etc)

In [4]:
sample_df = pd.read_csv('stackoverflow_survey_stratified_sample.csv')

In [5]:
df_use = sample_df.copy()

In [6]:
# Remove any columns with 'Unnamed' in their name from the combined dataframe
if df_use is not None:
    df_use = df_use.loc[:, ~df_use.columns.str.contains('^Unnamed')]

In [7]:
# Remove columns with 'nan' in their names from combined_df and individual year dataframes
if df_use is not None:
    # For combined dataframe
    nan_columns = df_use.columns[df_use.columns.str.contains('nan', case=False, na=False)]
    if len(nan_columns) > 0:
        print("Removing columns containing 'nan' from combined dataframe:")
        print(list(nan_columns))
        df_use = df_use.drop(columns=nan_columns)

print("\nDone cleaning column names.")

Removing columns containing 'nan' from combined dataframe:
['nan (Training & Education: Other)', 'nan (Training & Education: Masters in CS)', 'nan (Fixing bugs)', 'nan (Future Lang & Tech: R)', 'nan (Desktop Operating System)', 'nan (Why try Stack Overflow Careers: Selection of revelant jobs)', 'nan (TypeScript)', 'nan (Servers)', 'nan (Perception of contact form: Twitter)', 'nan (Current Lang & Tech: Cordova)', 'nan (Link to a Stack Overflow Careers Company Page or other source of more information about the company (videos, articles, etc))', 'nan (How frequently land on or read Stack Overflow)', 'nan (Current Lang & Tech: SQL Server)', 'nan (Future Lang & Tech: Redis)', 'nan (Commuting)', 'nan (Limited night / weekend work)', 'nan (Future Lang & Tech: Scala)', 'nan (Current Lang & Tech: Redis)', 'nan (Appealing message traits: Stack Overflow Company Page)', 'nan (Most urgent info about job opportunity: Job title)', 'nan (Node.js)', 'nan (Recommender)', 'nan (Other tablet)', 'nan', 'na

In [8]:
# Update all column names in combined_df and each dataframe in dataframes to be lower case
df_use.columns = [col.lower() for col in df_use.columns]

In [9]:
df_use.head()

Unnamed: 0,importanthiringrep,equipmentsatisfiedram,eduother,hoursperweek,learncodeai,jobsatpoints_16,company_size_range,webframewantentry,assessjobcompensation,techoppose_2,why_stack_overflow,languagedesirenextyear,ethicalimplications,comptotal,knowledge_6,yearscodepro,aitooldon't plan to use ai for this task,purchasehow,dogs_vs_cats,agreedisagree2,equipmentsatisfiedmonitors,please rate the advertising you've seen on stack overflow (the ads are relevant),fizzbuzz,influenceviztools,stackoverflowmoderation,haveworkedplatform,employment_status,interview_likelihood,equipmentsatisfiedrw,sovisit1st,job_discovery,what types of purchases are you involved in? (hardware),databasedesirenextyear,currencydesc,databasewanttoworkwith,workchallenge,icorpm,how likely is it that a recommendation you make will be acted upon?,webframeadmired,assessjobdiversity,employmentstatus,knowledge_8,methodology,adspriorities2,socomm,aiagentchallengesneutral,aiselect,which best describes the size of your company?,how old are you?,problemsolving,frustration,platformworkedwith,currencysymbol,adsagreedisagree2,accessibility,"you answered you don't have a careers profile, can you elaborate why?",wantworkdatabase,which of our sites do you frequent most?,stackoverflowdevices,questionsconfusing,how many people work for your company?,blockchainorg,salary_midpoint,purchaseinfluence,how often do you find solutions to your programming problems on stack overflow without asking a new question?,big_mac_index,remote,jobemailpriorities1,techendorse,techendorse_1,languagechoice,highesteducationparents,tabsspaces,companysize,formaleducation,stackoverflowrecommend,skipmeals,employment,developertype,how do you prefer to be contacted about job opportunities? (email),ethnicity,timesearching,country,assessjobindustry,difficultcommunication,languageworkedwith,aisent,platformwanttoworkwith,cousineducation,sotimesaved,commplatformhaveentr,agree_adblocker,sojobs,stackoverflowadsrelevant,hypotheticaltools1,inthezone,surveyeasy,webframehaveentry,knowledge_5,webframedesirenextyear,jobsatpoints_1,do you have a stack overflow careers profile?,exercise,how often do you visit job boards?,jobemailpriorities7,assessjob5,annoyingui,databasewantentry,tbranch,aidevhaveworkedwith,aiben,assessjob8,assessjob4,webframeworkedwith,influenceconsultants,knowledge_4,jobemailpriorities2,waketime,education,aiexplain,"what is your budget for outside expenditures (hardware, software, consulting, etc) for 2014?",aiagentorchwrite,aihuman,stackoverflowcompanypage,officestacksyncwanttoworkwith,updatecv,important_wfh,numbermonitors,resumeupdate,which us state or territory do you live in?,stackoverflowadsdistracting,languageadmired,importanthiringpmexp,assessbenefits4,hobbyist,yearscodingprof,embeddedhaveworkedwith,techoppose_7,aiagentchallengessomewhat agree,adspriorities1,which of the following best describes your occupation?,mainbranch,jobsatpoints_13,respondent,important_ownoffice,influencecloud,aimodelswantentry,team_size_range,aidangerous,embeddedadmired,assessjobtech,hopefiveyears,aitoolinterested in using,ainextless integrated,devenvshaveworkedwith,jobsatpoints_10,assessjobproduct,occupation_group,important_promotion,techoppose_16,so_region,databasechoice,excoderwillnotcode,welcomechange,new_job_value,how would you best describe the industry you currently work in?,competepeers,aiethics,techoppose_13,newovertime,what other departments / roles do you interact with regularly? (system administrators),toolstechhaveworkedwith,us_state,so_actions_10,stackoverflowdescribes,aimodelshaveentry,excoderbelonged,assessjob2,important_control,agree_notice,assessjob6,select up to 3 (most annoying about job search: finding time),toolstechadmired,jobemailpriorities4,devenvswanttoworkwith,importanthiringtechexp,collaborateremote,importanthiringgettingthingsdone,agree_nightcode,metricassess,diversityimportant,vchostingprofessional use,versioncontrolsystem,newstuck,how large is the team that you work on?,importanthiringopensource,sohowmuchtime,excoderskills,what is your current stack overflow reputation?,frequency_3,sotagshaveentry,purchasewhat,which technologies are you excited about? (node.js),what type of project are you developing?,assessjob10,webframehaveworkedwith,sofindanswer,yearscodedjob,influencetechstack,techendorse_3,newcollabtoolshaveworkedwith,remotework,officestacksynchaveworkedwith,languageshaveentry,techendorse_7,aiagents,stackoverflowparticipate,compfreq,vchostingpersonal use,open_to_new_job,workstart,race,trans,workexp,newofftopic,stackoverflowcommunity,surveylength,desktop_os,jobprofile,developer_challenges,why_learn_new_tech,aisearchdevadmired,aiagentchange,age_range,buildingthings,assessjoboffice,jobemailpriorities3,aiagentimpactstrongly disagree,aitoolplan to partially use ai,responseid,what operating system do you use the most?,what advertisers do you remember seeing on stack overflow? (open-ended response),socialmedia,platformhaveentry,surveyyear,jobsatpoints_8,importanthiringalgorithms,ethicsresponsible,officestackasynchaveworkedwith,officestackasyncadmired,officestackwantentry,equipmentsatisfiedstorage,buynewtool,so_actions_4,blockchain,agree_tech,unittests,betterlife,hoursoutside,workweekhrs,adspriorities6,newjobhunt,commplatformwanttoworkwith,sexualorientation,knowledge_2,jobsecurity,influenceservers,stackoverflowanswer,sopartfreq,job_satisfaction,assessbenefits7,devtype,commit_frequency,newcollabtoolsworkedwith,age,newdevopsimpt,jobseek,commplatformhaveworkedwith,select up to 3 (how can companies improve interview process: more live code),learncodeonline,adsactions,misctechadmired,importanthiringcompanies,so_actions_16,commplatformwantentr,understandcomputers,which desktop operating system do you use the most?,assessjobremote,aifrustration,aimodelschoice,sotagswanttoworkwith,salary_range,challengemyself,frequency_2,ide,assessbenefits5,"in an average week, how do you spend your time at work? (developing new features)",clickykeys,stackoverflowdevstory,assessjobexp,techlist,ainextvery similar,so_actions_5,knowledge_7,stackoverflowsatisfaction,what best describes your career / job satisfaction?,stackoverflowbetter,stackoverflowvisit,assessbenefits8,jobsat,influencedatabase,surveyease,how often are you contacted by recruiters?,adspriorities4,coderev,assessbenefits3,newothercomms,embeddedwanttoworkwith,commplatformadmired,aimodelsadmired,excodernotforme,careersatisfaction,devenvschoice,sonewcontent,aithreat,platformadmired,aimodelswanttoworkwith,boringdetails,sexuality,adspriorities3,what country do you live in?,haveworkeddatabase,how did you find out about your current job?,misctechhaveworkedwith,salarytype,important_buildexisting,truefalse_3,adblockerdisable,jobsatisfaction,are you currently looking for a job or open to new opportunities?,"including bonus, what is your annual compensation in usd?",assessjobleaders,jobemailpriorities5,select all that apply (training & education: no formal training),yearscode,aiacc,haveworkedframework,self_identification,platformchoice,sohow,important_sameend,webframechoice,so_actions_3,newcollabtoolsdesirenextyear,databasehaveworkedwith,misctechworkedwith,timeanswering,languageswantentry,stackoverflowjoblisting,occupation,webdevelopertype,jobcontactpriorities3,ainextsomewhat similar,sotagshaveworkedwith,assessjobprojects,aisearchdevwanttoworkwith,techoppose_5,women_on_team,currency,auditoryenvironment,aiagentchallengesstrongly agree,ethicschoice,onboarding,aichallenges,rep_range,learnedhiring,stackoverflowjobs,assessjobcommute,sotagsadmired,wantworkplatform,orgsize,sovisitto,assessbenefits6,influenceinternet,vcinteraction,knowledge_1,unit_testing,professional,ethicsreport,techendorse_13_text,ainextno change,aidevwanttoworkwith,select all that apply (why try stack overflow careers: no spam),jobsatpoints_14,adsagreedisagree1,assessjob1,communicationtools,hypotheticaltools4,newedimpt,newdevops,frequency_1,excoderbalance,hypotheticaltools5,edlevel,influencerecruitment,adspriorities5,stackoverflowjobsrecommend,importanthiringtitles,knowledge_3,"if your company has a native mobile app, what platforms do you support? (iphone)",frameworkdesirenextyear,officestacksyncadmired,aitoolcurrently mostly ai,stackoverflowfoundanswer,assessbenefits1,operatingsystem,adblockerreasons,militaryus,webframedesirenextyear.1,so_dev_content,were you aware of the apptivate contest?,which languages are you proficient in? (java),otherpeoplescode,aicomplex,opsys,languagewanttoworkwith,did you participate in the apptivate contest?,where do you work remotely most of the time?,salary,techdoc,"in the last 12 months, how much money have you spent on personal technology-related purchases?",job_search_annoyance,aisearchhaveworkedwith,databasehaveentry,screenname,extraversion,sotagswant entry,techoppose_3,aiagentimpactneutral,assessbenefits2,aisearchwanttoworkwith,newonboardgood,aisearchdevhaveworkedwith,agree_legacy,aiagentextwrite,yearscoding,ainextvery different,convertedcomp,mgrmoney,yearsprogram,newsosites,q120,select up to 3 (most important aspect of new job opportunity: salary),techendorseintro,have you changed jobs in the last 12 months?,selftaughttypes,jobsatpoints_15,jobsatpoints_4,platformwantentry,agree_mars,timeafterbootcamp,influencehardware,aifuture,aitoolnot interested in using,ailearnhow,jobsatpoints_9,do you work remotely?,assessjob9,dev_environment,agentusesgeneral,so_actions_7,homeremote,agree_problemsolving,university,assessbenefits10,platformdesirenextyear,opensourcer,jobseekingstatus,do you enjoy working remotely?,assessjobprofdevel,workremote,coderevhrs,webframeworkedwith.1,interestedanswers,languagehaveworkedwith,"please rate how important each of the following characteristics of a company/job offer are to you. please select a maximum of 3 items as ""non-negotiables"" to help us identify the most important items, those where you would never consider a company if they didn't meet them. (high base compensation)",soaccount,excoderreturn,adblocker,what is your gender?,nondevelopertype,companytype,raceethnicity,aiagentchallengesstrongly disagree,aiagent_uses,industry,newpurplelink,aiagentexternal,gender,"including yourself, how many developers are employed at your company?",jobcontactpriorities1,learncodechoose,aliens,techendorse_8,important_newtech,"in an average week, how do you spend your time? (developing new features)",yearscodedjobpast,so_actions_15,checkincode,aiagentobswrite,soduration,learningnewtech,ainextmore integrated,select all that apply (most urgent info about job opportunity: salary),agree_diversity,aiagentknowledge,tech_want,"what is your budget for outside expenditures (hardware, software, consulting, etc) for 2011? (<$10,000)",devenviron,have you visited / are you aware of stack overflow careers?,tech_do,select all that apply (current lang & tech: android),opsyspersonal use,aiagentobservesecure,assessbenefits11,excoder10years,misctechdesirenextyear,have you visited / are you aware of stack overflow careers 2.0?,agree_alcohol,hobby,hypotheticaltools3,which of the following languages or technologies have you used significantly in the past year? (c),shipit,careersat,newjobhuntresearch,what is your involvement in purchasing? you can choose more than 1. (influencer),un_subregion,professionaltech,agree_loveboss,jobsatpoints_11,newcollabtoolsadmired,assessjobrole,what country or region do you live in?,jobemailpriorities6,misctechwanttoworkwith,agreedisagree1,aiagentimpactstrongly agree,"what is your budget for outside expenditures (hardware, software, consulting, etc) for 2013?",how would you best describe the industry you work in?,how_to_improve_interview_process,dependents,convertedcompyearly,wantworkframework,select all that apply (why use stack overflow: help for job),ainextneither different nor similar,educationimportant,soai,techoppose_1,aiagentimpactsomewhat agree,jobsatpoints_5,officestackasyncwanttoworkwith,important_buildnew,itperson,techoppose_9,aimodelshaveworkedwith,timefullyproductive,webframewanttoworkwith,hobby.1,aiagentorchestration,airesponsible,"if you make a software product, how does your company make money? (you can choose more than one) (advertising)",stackoverflowmakemoney,adspriorities7,impsyn,star_wars_vs_star_trek,assessjobdept,lastint,techoppose_15,undergradmajor,jobsatpoints_6,changeworld,truefalse_2,newcollabtoolswanttoworkwith,assessbenefits9,check,excoderactive,truefalse_1,friendsdevelopers,devenvhaveentry,lasthiredate,country.1,techendorse_13,so_actions_9,aiinteresting,surveytoolong,expectedsalary,aitoolplan to mostly use ai,newpurchaseresearch,seriouswork,do you have a stack overflow careers 2.0 profile?,jobsearchstatus,professionalquestion,pronouncegif,mentalhealth,knowledge_9,mgrwant,learncode,newrole,employmentaddl,assessjob3,ainextmuch more integrated,devenvsadmired,overpaid,frameworkworkedwith,what is your involvement in purchasing products or services for the company you work for? (you can choose more than one) (i can recommend or request products),wantworklanguage,rightwrongway,uk_country,experience_range,select up to 3 (appealing message traits: message is personalized),agreedisagree3,jobcontactpriorities4,techendorse_5,codingactivities,how do you use stack overflow? (read other people's questions to solve my problems),programming_ability,versioncontrol,toolstechwanttoworkwith,"what is your budget for outside expenditures (hardware, software, consulting, etc) for 2011?",hackathonreasons,toolcountwork,aitoolcurrently using,aiopen,questionsinteresting,sofriction,techoppose_11,so_actions_1,equipmentsatisfiedcpu,experience_midpoint,databaseadmired,aiagentimpactsomewhat disagree,investtimetools,mobiledevelopertype,workpaycare,techendorse_2,influenceworkstation,influencecommunication,ainextsomewhat different,jobsatpoints_15_text,so_actions_6,projectmanagement,gender.1,educationparents,jobsatpoints_7,age_midpoint,toolcountpersonal,collector,stackoverflowmetachat,surveylong,convertedsalary,age1stcode,stackoverflowcopiedcode,newlearn,important_companymission,how many developers are employed at your company?,stackoverflowconsidermember,offon,entteams,platformhaveworkedwith,select all that apply (source control used: git),which technology products do you own? (you can choose more than one) (iphone),assessjob7,jobcontactpriorities2,opensource,blockchainis,workplan,educationtypes,aiagentchallengessomewhat disagree,stackoverflowwhatdo,ergonomicdevices,adsagreedisagree3,learncodecoursescert,please rate your job/career satisfaction,hypotheticaltools2,stackoverflowhasaccount,aiagentknowwrite,importanthiringeducation,industry.1,visit_frequency,select all that apply (future lang & tech: android),haveworkedlanguage,jobfactors,"in receiving an email about a job opportunity, what attributes of the message would make you more likely to respond? (message is personalized to me)",workloc,containers,ainextmuch less integrated,devenvwantentry,lastnewjob,jobcontactpriorities5,stackoverflowjobsearch,professionalcloud,mgridiot,buildvsbuy,opsysprofessional use,stackoverflowhelpful,how many years of it/programming experience do you have?,resumeprompted,databaseworkedwith,important_variety,majorundergrad,what is your involvement in purchasing products or services for the company you work for? (you can choose more than one) (influencer),aitoolcurrently partially ai,officestackhaveentry,kinshipdevelopers,enjoydebugging,so_actions_15_text,stackoverflownewquestion,hourscomputer,select all that apply (why answer: help a programmer in need),influencedepttech,sovisitfreq,techendorse_9,importantbenefits,techendorse_4,importanthiringcommunication,student,techoppose_15_text,techendorse_6,programhobby
0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,Unless it's stoopid it gets done,,,,,,,,,,Start Up (1-25),30-34,,,,,,,,,Stack Overflow,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,California,,,,,,,,,,,Server Programmer,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,Web Platform,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,Linux,,,,2011,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,Java,,,,,,,,,">$3,000",,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,Influencer,,,,,,,United States of America,,,,,,Web Services,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,iPhone,,,,,,,,,,,,So happy it hurts,,,,,,,,,,,,,,,,,,,,,,,11,,,,,,,,,,,,,,,,,,,,,,,
1,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,It's been known to happen,,,,,,,,,,Start Up (1-25),40-50,,,,,,,,,Stack Overflow,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,Web Application Developer,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,Enterprise,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,Mac OS X,,,,2011,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,"$40,000 - $60,000",,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,Java,,,,,,,,,"$2,001-$3,000",,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,Other Europe,,,,,,Software Products,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,iPhone,,,,,,,,,,,,I enjoy going to work,,,,,,,,,,,,,,,,,,,,,,,11,,,,,,,,,,,,,,,,,,,,,,,
2,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,Once in a blue moon,,,,,,,,,,Mature Small Business (25-100),30-34,,,,,,,,,Stack Overflow,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,Desktop Application Developer,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,Enterprise,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,Windows 7,,,,2011,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,"$20,000 - $40,000",,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,"$501-$1,000",,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,"<$10,000",,,,,,,,,,,,,,,,,,Influencer,,,,,,,South America,,,,,,Software Products,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,iPhone,,,,,,,,,,,,So happy it hurts,,,,,,,,,,,,,,,,,,,,,,,11,,,,,,,,,,,,,,,,,,,,,,,
3,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,Not in a million years,,,,,,,,,,Mid Sized (100-999),30-34,,,,,,,,,Programmers Stack Exchange,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,Desktop Application Developer,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,Other,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,Mac OS X,,,,2011,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,"$60,000 - $80,000",,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,">$3,000",,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,Other Asia,,,,,,Healthcare,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,FML,,,,,,,,,,,,,,,,,,,,,,,41435,,,,,,,,,,,,,,,,,,,,,,,
4,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,Once in a blue moon,,,,,,,,,,"Other (not working, consultant, etc.)",25-29,,,,,,,,,Stack Overflow,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,Student,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,Other,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,Linux,,,,2011,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,"<$20,000",,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,Java,,,,,,,,,$251-$500,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,Other Europe,,,,,,Other,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,I enjoy going to work,,,,,,,,,,,,,,,,,,,,,,,41310,,,,,,,,,,,,,,,,,,,,,,,


In [10]:
# Find columns containing 'country'
country_columns = [col for col in df_use.columns if 'country' in str(col).lower()]

print("Columns containing 'country':")
for col in country_columns:
    try:
        # Select the column(s). If multiple columns share the same name this returns a DataFrame.
        selected = df_use.loc[:, col]
        # If a DataFrame is returned (duplicate column names), collapse to a single Series by taking
        # the first non-null value across duplicates for each row.
        if isinstance(selected, pd.DataFrame):
            if selected.shape[1] > 1:
                print(f"\nWarning: column name '{col}' is duplicated ({selected.shape[1]} columns). Combining duplicates by taking first non-null value.")
            series = selected.bfill(axis=1).iloc[:, 0]
        else:
            series = selected.squeeze()

        print(f"\nColumn: {col}")
        print("Top 5 most common values and their counts:")
        counts = series.fillna('NULL').value_counts().head()
        print(counts)

        # Get unique count excluding nulls
        unique_count = series.dropna().nunique()
        print(f"\nTotal unique values (excluding nulls): {unique_count}")
        print(f"Number of null values: {series.isnull().sum()}")
        print("-" * 50)
    except Exception as e:
        print(f"\nError processing column {col}: {str(e)}")
        print("-" * 50)


Columns containing 'country':


Column: country
Top 5 most common values and their counts:
country
United States               15827
NULL                        15330
India                       14075
United States of America    13283
Germany                     10046
Name: count, dtype: int64

Total unique values (excluding nulls): 235
Number of null values: 15330
--------------------------------------------------

Column: what country do you live in?
Top 5 most common values and their counts:
what country do you live in?
NULL              152992
United States        400
India                166
United Kingdom       143
Germany               85
Name: count, dtype: int64

Total unique values (excluding nulls): 80
Number of null values: 152992
--------------------------------------------------

Column: what country or region do you live in?
Top 5 most common values and their counts:
what country or region do you live in?
NULL                        150762
United States of America      1

In [11]:
# Dictionary of column groups to combine
column_groups = {
    'years_coding': ['yearscode', 'yearscodingprof', 'yearscodepro', 'work_experience', 'yearsprogram', 'yearscodedjob', 'how many years of it/programming experience do you have?'],
    'education': ['edlevel', 'education', 'formaleducation'],
    'employment': ['employment', 'employmentstatus', 'employment_status'],
    'company_size': ['companysize', 'company_size_range', 'orgsize', 'companyemployeesrange', 'how many people work for your company?', 'which best describes the size of your company?'],
    'salary': ['convertedsalary', 'convertedcomp', 'comptotal'],
    'job_satisfaction': ['jobsatisfaction', 'job_satisfaction', 'careersatisfaction', 'please rate your job/career satisfaction'],
    'job_title': ['jobtitle', 'currentjobtitle', 'jobprofile'],
    'developer_type': ['developertype', 'devtype'],
    'industry': ['industry', 'industrytype', 'companytype', 'how would you best describe the industry you work in?', 'how would you best describe the industry you currently work in?'],
    'country': ['country', 'location', 'countrycode', 'what country or region do you live in?', 'what country do you live in?'],
    'programming_experience': ['yearscode', 'yearscoding', 'codingexperience'],
    'database_worked_with': ['databaseworkedwith', 'dbworkedwith'],
    'dev_environment': ['ide', 'developmentenvironment', 'dev_environment', 'devenviron', 'devenvironment'],
    'operating_sys': ['opsys', 'operatingsystem', 'os', 'what operating system do you use the most?', 'which desktop operating system do you use the most?'],
    'dev_methodology': ['methodology', 'devmethodology', 'developmentmethodology'],
    'communication_tools': ['communicationtools', 'collaboration', 'collabtools'],
    'gender': ['gender', 'sex', 'what is your gender?'],
    'age': ['age', 'agerange','age_range','agegrouping', 'how old are you?'],
    'learning': ['learncode', 'learncodehow', 'learningmethod'],
    'work_experience': ['workexp', 'experience', 'yearsexperience'],
    'remote': ['remotework', 'workremote', 'remotestatus', 'do you work remotely?', 'homeremote', 'remote'],
    'team_size': ['teamsize', 'orgteamsize', 'developmentteamsize', 'how large is the team that you work on?'],
    'survey_easy': ['surveyease', 'surveyeasy', 'surveylong', 'surveytoolong', 'surveylength'],
    'version_control_sys': ['versioncontrol', 'versioncontrolsystem', 'vcs'],
    'currency': ['currency', 'currencydesc'],
    'hobby': ['hobby', 'hobbyist'],
    'race': ['race', 'raceethnicity', 'self_identification']
}

In [12]:
df_use_2 = df_use.copy()

In [13]:
# Identify and combine duplicate columns
def combine_duplicate_columns(df):
    """
    Identifies columns with the same name (case-insensitive), combines their data
    into a single column by taking the first non-null value, and removes the originals.

    Args:
        df: The input DataFrame.

    Returns:
        A new DataFrame with duplicate columns combined.
    """
    df_combined_duplicates = pd.DataFrame(index=df.index)
    processed_columns = set()

    for col_name in df.columns:
        col_name_lower = col_name.lower()

        if col_name_lower not in processed_columns:
            # Find all columns with this name (case-insensitive)
            duplicate_columns = [col for col in df.columns if col.lower() == col_name_lower]

            if len(duplicate_columns) > 1:
                print(f"Combining duplicate columns for '{col_name_lower}': {duplicate_columns}")
                # Select the duplicate columns
                selected_duplicates = df[duplicate_columns]
                # Combine by taking the first non-null value across rows
                combined_series = selected_duplicates.bfill(axis=1).iloc[:, 0]
                df_combined_duplicates[col_name_lower] = combined_series
                # Add to processed set
                processed_columns.add(col_name_lower)
            else:
                # Not a duplicate, just add the column
                df_combined_duplicates[col_name_lower] = df[col_name]
                processed_columns.add(col_name_lower)

    return df_combined_duplicates

# Apply the function to combine duplicate columns in df_use_2
df_use_combined_duplicates = combine_duplicate_columns(df_use_2)

print("\nOriginal DataFrame shape:", df_use_2.shape)
print("DataFrame shape after combining duplicates:", df_use_combined_duplicates.shape)

Combining duplicate columns for 'country': ['country', 'country']
Combining duplicate columns for 'webframedesirenextyear': ['webframedesirenextyear', 'webframedesirenextyear']
Combining duplicate columns for 'webframeworkedwith': ['webframeworkedwith', 'webframeworkedwith']
Combining duplicate columns for 'industry': ['industry', 'industry']
Combining duplicate columns for 'gender': ['gender', 'gender']
Combining duplicate columns for 'hobby': ['hobby', 'hobby']

Original DataFrame shape: (154521, 729)
DataFrame shape after combining duplicates: (154521, 723)


In [14]:
def combine_columns(df, column_list):
    """
    Combines data from a list of columns into a single Series,
    taking the first non-null value across the columns for each row.

    Args:
        df: The input pandas DataFrame.
        column_list: A list of column names to combine.

    Returns:
        A pandas Series containing the combined data.
    """
    # Select the specified columns
    selected_columns = df[column_list]

    # Combine columns by taking the first non-null value across rows
    combined_series = selected_columns.bfill(axis=1).iloc[:, 0]

    return combined_series

In [15]:
# Initialize an empty dictionary to store consolidated columns
consolidated_columns_dict = {}

# Iterate through the column_groups dictionary
for group_name, column_list in column_groups.items():
    # Identify columns in df_use_combined_duplicates that are present in the current group's list
    present_columns = [col for col in column_list if col in df_use_combined_duplicates.columns]

    # If there are columns from the current group present in the DataFrame
    if present_columns:
        print(f"Processing group '{group_name}' with columns: {present_columns}")
        # Call the combine_columns function
        combined_series = combine_columns(df_use_combined_duplicates, present_columns)
        # Store the resulting combined Series in the dictionary
        consolidated_columns_dict[group_name] = combined_series
    else:
        print(f"No columns found for group '{group_name}' in the DataFrame.")

# Create a new DataFrame from the dictionary of consolidated columns
df_consolidated = pd.DataFrame(consolidated_columns_dict)

print("\nConsolidated DataFrame created.")
print(f"Shape of consolidated DataFrame: {df_consolidated.shape}")
df_consolidated.head()

Processing group 'years_coding' with columns: ['yearscode', 'yearscodingprof', 'yearscodepro', 'yearsprogram', 'yearscodedjob', 'how many years of it/programming experience do you have?']
Processing group 'education' with columns: ['edlevel', 'education', 'formaleducation']
Processing group 'employment' with columns: ['employment', 'employmentstatus', 'employment_status']
Processing group 'company_size' with columns: ['companysize', 'company_size_range', 'orgsize', 'how many people work for your company?', 'which best describes the size of your company?']
Processing group 'salary' with columns: ['convertedsalary', 'convertedcomp', 'comptotal']
Processing group 'job_satisfaction' with columns: ['jobsatisfaction', 'job_satisfaction', 'careersatisfaction', 'please rate your job/career satisfaction']
Processing group 'job_title' with columns: ['jobprofile']
Processing group 'developer_type' with columns: ['developertype', 'devtype']
Processing group 'industry' with columns: ['industry', 'c

Unnamed: 0,years_coding,education,employment,company_size,salary,job_satisfaction,job_title,developer_type,industry,country,programming_experience,database_worked_with,dev_environment,operating_sys,dev_methodology,communication_tools,gender,age,learning,work_experience,remote,team_size,survey_easy,version_control_sys,currency,hobby,race
0,11,,,Start Up (1-25),,So happy it hurts,,,Web Services,United States of America,,,,Linux,,,,30-34,,,,,,,,,
1,11,,,Start Up (1-25),,I enjoy going to work,,,Software Products,Other Europe,,,,Mac OS X,,,,40-50,,,,,,,,,
2,11,,,Mature Small Business (25-100),,So happy it hurts,,,Software Products,South America,,,,Windows 7,,,,30-34,,,,,,,,,
3,41435,,,Mid Sized (100-999),,FML,,,Healthcare,Other Asia,,,,Mac OS X,,,,30-34,,,,,,,,,
4,41310,,,"Other (not working, consultant, etc.)",,I enjoy going to work,,,Other,Other Europe,,,,Linux,,,,25-29,,,,,,,,,


In [16]:
# Create a list of all original column names that were combined
original_columns_to_drop = []
for group_name, column_list in column_groups.items():
    # Identify columns in df_use_combined_duplicates that are present in the current group's list
    present_columns = [col for col in column_list if col in df_use_combined_duplicates.columns]
    original_columns_to_drop.extend(present_columns)

# Remove duplicates from the list of columns to drop
original_columns_to_drop = list(set(original_columns_to_drop))

# Drop these original columns from the df_use_combined_duplicates DataFrame
df_use_combined_duplicates_dropped = df_use_combined_duplicates.drop(columns=original_columns_to_drop, errors='ignore')

print("Original columns that were combined have been dropped.")
print(f"Shape of df_use_combined_duplicates after dropping original columns: {df_use_combined_duplicates_dropped.shape}")

Original columns that were combined have been dropped.
Shape of df_use_combined_duplicates after dropping original columns: (154521, 651)


In [17]:
# Concatenate the remaining columns from df_use_combined_duplicates_dropped with the consolidated columns
df_all = pd.concat([df_use_combined_duplicates_dropped, df_consolidated], axis=1)

print("\nFinal DataFrame created by concatenating remaining original columns and consolidated columns.")
print(f"Shape of the final DataFrame: {df_all.shape}")

# Display the first few rows of the final dataframe
print("\nFirst 5 rows of the final DataFrame:")
display(df_all.head())


Final DataFrame created by concatenating remaining original columns and consolidated columns.
Shape of the final DataFrame: (154521, 678)

First 5 rows of the final DataFrame:


Unnamed: 0,importanthiringrep,equipmentsatisfiedram,eduother,hoursperweek,learncodeai,jobsatpoints_16,webframewantentry,assessjobcompensation,techoppose_2,why_stack_overflow,languagedesirenextyear,ethicalimplications,knowledge_6,aitooldon't plan to use ai for this task,purchasehow,dogs_vs_cats,agreedisagree2,equipmentsatisfiedmonitors,please rate the advertising you've seen on stack overflow (the ads are relevant),fizzbuzz,influenceviztools,stackoverflowmoderation,haveworkedplatform,interview_likelihood,equipmentsatisfiedrw,sovisit1st,job_discovery,what types of purchases are you involved in? (hardware),databasedesirenextyear,databasewanttoworkwith,workchallenge,icorpm,how likely is it that a recommendation you make will be acted upon?,webframeadmired,assessjobdiversity,knowledge_8,adspriorities2,socomm,aiagentchallengesneutral,aiselect,problemsolving,frustration,platformworkedwith,currencysymbol,adsagreedisagree2,accessibility,"you answered you don't have a careers profile, can you elaborate why?",wantworkdatabase,which of our sites do you frequent most?,stackoverflowdevices,questionsconfusing,blockchainorg,salary_midpoint,purchaseinfluence,how often do you find solutions to your programming problems on stack overflow without asking a new question?,big_mac_index,jobemailpriorities1,techendorse,techendorse_1,languagechoice,highesteducationparents,tabsspaces,stackoverflowrecommend,skipmeals,how do you prefer to be contacted about job opportunities? (email),ethnicity,timesearching,assessjobindustry,difficultcommunication,languageworkedwith,aisent,platformwanttoworkwith,cousineducation,sotimesaved,commplatformhaveentr,agree_adblocker,sojobs,stackoverflowadsrelevant,hypotheticaltools1,inthezone,webframehaveentry,knowledge_5,webframedesirenextyear,jobsatpoints_1,do you have a stack overflow careers profile?,exercise,how often do you visit job boards?,jobemailpriorities7,assessjob5,annoyingui,databasewantentry,tbranch,aidevhaveworkedwith,aiben,assessjob8,assessjob4,webframeworkedwith,influenceconsultants,knowledge_4,jobemailpriorities2,waketime,aiexplain,"what is your budget for outside expenditures (hardware, software, consulting, etc) for 2014?",aiagentorchwrite,aihuman,stackoverflowcompanypage,officestacksyncwanttoworkwith,updatecv,important_wfh,numbermonitors,resumeupdate,which us state or territory do you live in?,stackoverflowadsdistracting,languageadmired,importanthiringpmexp,assessbenefits4,embeddedhaveworkedwith,techoppose_7,aiagentchallengessomewhat agree,adspriorities1,which of the following best describes your occupation?,mainbranch,jobsatpoints_13,respondent,important_ownoffice,influencecloud,aimodelswantentry,team_size_range,aidangerous,embeddedadmired,assessjobtech,hopefiveyears,aitoolinterested in using,ainextless integrated,devenvshaveworkedwith,jobsatpoints_10,assessjobproduct,occupation_group,important_promotion,techoppose_16,so_region,databasechoice,excoderwillnotcode,welcomechange,new_job_value,competepeers,aiethics,techoppose_13,newovertime,what other departments / roles do you interact with regularly? (system administrators),toolstechhaveworkedwith,us_state,so_actions_10,stackoverflowdescribes,aimodelshaveentry,excoderbelonged,assessjob2,important_control,agree_notice,assessjob6,select up to 3 (most annoying about job search: finding time),toolstechadmired,jobemailpriorities4,devenvswanttoworkwith,importanthiringtechexp,collaborateremote,importanthiringgettingthingsdone,agree_nightcode,metricassess,diversityimportant,vchostingprofessional use,newstuck,importanthiringopensource,sohowmuchtime,excoderskills,what is your current stack overflow reputation?,frequency_3,sotagshaveentry,purchasewhat,which technologies are you excited about? (node.js),what type of project are you developing?,assessjob10,webframehaveworkedwith,sofindanswer,influencetechstack,techendorse_3,newcollabtoolshaveworkedwith,officestacksynchaveworkedwith,languageshaveentry,techendorse_7,aiagents,stackoverflowparticipate,compfreq,vchostingpersonal use,open_to_new_job,workstart,trans,newofftopic,stackoverflowcommunity,desktop_os,developer_challenges,why_learn_new_tech,aisearchdevadmired,aiagentchange,buildingthings,assessjoboffice,jobemailpriorities3,aiagentimpactstrongly disagree,aitoolplan to partially use ai,responseid,what advertisers do you remember seeing on stack overflow? (open-ended response),socialmedia,platformhaveentry,surveyyear,jobsatpoints_8,importanthiringalgorithms,ethicsresponsible,officestackasynchaveworkedwith,officestackasyncadmired,officestackwantentry,equipmentsatisfiedstorage,buynewtool,so_actions_4,blockchain,agree_tech,unittests,betterlife,hoursoutside,workweekhrs,adspriorities6,newjobhunt,commplatformwanttoworkwith,sexualorientation,knowledge_2,jobsecurity,influenceservers,stackoverflowanswer,sopartfreq,assessbenefits7,commit_frequency,newcollabtoolsworkedwith,newdevopsimpt,jobseek,commplatformhaveworkedwith,select up to 3 (how can companies improve interview process: more live code),learncodeonline,adsactions,misctechadmired,importanthiringcompanies,so_actions_16,commplatformwantentr,understandcomputers,assessjobremote,aifrustration,aimodelschoice,sotagswanttoworkwith,salary_range,challengemyself,frequency_2,assessbenefits5,"in an average week, how do you spend your time at work? (developing new features)",clickykeys,stackoverflowdevstory,assessjobexp,techlist,ainextvery similar,so_actions_5,knowledge_7,stackoverflowsatisfaction,what best describes your career / job satisfaction?,stackoverflowbetter,stackoverflowvisit,assessbenefits8,jobsat,influencedatabase,how often are you contacted by recruiters?,adspriorities4,coderev,assessbenefits3,newothercomms,embeddedwanttoworkwith,commplatformadmired,aimodelsadmired,excodernotforme,devenvschoice,sonewcontent,aithreat,platformadmired,aimodelswanttoworkwith,boringdetails,sexuality,adspriorities3,haveworkeddatabase,how did you find out about your current job?,misctechhaveworkedwith,salarytype,important_buildexisting,truefalse_3,adblockerdisable,are you currently looking for a job or open to new opportunities?,"including bonus, what is your annual compensation in usd?",assessjobleaders,jobemailpriorities5,select all that apply (training & education: no formal training),aiacc,haveworkedframework,platformchoice,sohow,important_sameend,webframechoice,so_actions_3,newcollabtoolsdesirenextyear,databasehaveworkedwith,misctechworkedwith,timeanswering,languageswantentry,stackoverflowjoblisting,occupation,webdevelopertype,jobcontactpriorities3,ainextsomewhat similar,sotagshaveworkedwith,assessjobprojects,aisearchdevwanttoworkwith,techoppose_5,women_on_team,auditoryenvironment,aiagentchallengesstrongly agree,ethicschoice,onboarding,aichallenges,rep_range,learnedhiring,stackoverflowjobs,assessjobcommute,sotagsadmired,wantworkplatform,sovisitto,assessbenefits6,influenceinternet,vcinteraction,knowledge_1,unit_testing,professional,ethicsreport,techendorse_13_text,ainextno change,aidevwanttoworkwith,select all that apply (why try stack overflow careers: no spam),jobsatpoints_14,adsagreedisagree1,assessjob1,hypotheticaltools4,newedimpt,newdevops,frequency_1,excoderbalance,hypotheticaltools5,influencerecruitment,adspriorities5,stackoverflowjobsrecommend,importanthiringtitles,knowledge_3,"if your company has a native mobile app, what platforms do you support? (iphone)",frameworkdesirenextyear,officestacksyncadmired,aitoolcurrently mostly ai,stackoverflowfoundanswer,assessbenefits1,adblockerreasons,militaryus,so_dev_content,were you aware of the apptivate contest?,which languages are you proficient in? (java),otherpeoplescode,aicomplex,languagewanttoworkwith,did you participate in the apptivate contest?,where do you work remotely most of the time?,salary,techdoc,"in the last 12 months, how much money have you spent on personal technology-related purchases?",job_search_annoyance,aisearchhaveworkedwith,databasehaveentry,screenname,extraversion,sotagswant entry,techoppose_3,aiagentimpactneutral,assessbenefits2,aisearchwanttoworkwith,newonboardgood,aisearchdevhaveworkedwith,agree_legacy,aiagentextwrite,ainextvery different,mgrmoney,newsosites,q120,select up to 3 (most important aspect of new job opportunity: salary),techendorseintro,have you changed jobs in the last 12 months?,selftaughttypes,jobsatpoints_15,jobsatpoints_4,platformwantentry,agree_mars,timeafterbootcamp,influencehardware,aifuture,aitoolnot interested in using,ailearnhow,jobsatpoints_9,assessjob9,agentusesgeneral,so_actions_7,agree_problemsolving,university,assessbenefits10,platformdesirenextyear,opensourcer,jobseekingstatus,do you enjoy working remotely?,assessjobprofdevel,coderevhrs,interestedanswers,languagehaveworkedwith,"please rate how important each of the following characteristics of a company/job offer are to you. please select a maximum of 3 items as ""non-negotiables"" to help us identify the most important items, those where you would never consider a company if they didn't meet them. (high base compensation)",soaccount,excoderreturn,adblocker,nondevelopertype,aiagentchallengesstrongly disagree,aiagent_uses,newpurplelink,aiagentexternal,"including yourself, how many developers are employed at your company?",jobcontactpriorities1,learncodechoose,aliens,techendorse_8,important_newtech,"in an average week, how do you spend your time? (developing new features)",yearscodedjobpast,so_actions_15,checkincode,aiagentobswrite,soduration,learningnewtech,ainextmore integrated,select all that apply (most urgent info about job opportunity: salary),agree_diversity,aiagentknowledge,tech_want,"what is your budget for outside expenditures (hardware, software, consulting, etc) for 2011? (<$10,000)",have you visited / are you aware of stack overflow careers?,tech_do,select all that apply (current lang & tech: android),opsyspersonal use,aiagentobservesecure,assessbenefits11,excoder10years,misctechdesirenextyear,have you visited / are you aware of stack overflow careers 2.0?,agree_alcohol,hypotheticaltools3,which of the following languages or technologies have you used significantly in the past year? (c),shipit,careersat,newjobhuntresearch,what is your involvement in purchasing? you can choose more than 1. (influencer),un_subregion,professionaltech,agree_loveboss,jobsatpoints_11,newcollabtoolsadmired,assessjobrole,jobemailpriorities6,misctechwanttoworkwith,agreedisagree1,aiagentimpactstrongly agree,"what is your budget for outside expenditures (hardware, software, consulting, etc) for 2013?",how_to_improve_interview_process,dependents,convertedcompyearly,wantworkframework,select all that apply (why use stack overflow: help for job),ainextneither different nor similar,educationimportant,soai,techoppose_1,aiagentimpactsomewhat agree,jobsatpoints_5,officestackasyncwanttoworkwith,important_buildnew,itperson,techoppose_9,aimodelshaveworkedwith,timefullyproductive,webframewanttoworkwith,aiagentorchestration,airesponsible,"if you make a software product, how does your company make money? (you can choose more than one) (advertising)",stackoverflowmakemoney,adspriorities7,impsyn,star_wars_vs_star_trek,assessjobdept,lastint,techoppose_15,undergradmajor,jobsatpoints_6,changeworld,truefalse_2,newcollabtoolswanttoworkwith,assessbenefits9,check,excoderactive,truefalse_1,friendsdevelopers,devenvhaveentry,lasthiredate,techendorse_13,so_actions_9,aiinteresting,expectedsalary,aitoolplan to mostly use ai,newpurchaseresearch,seriouswork,do you have a stack overflow careers 2.0 profile?,jobsearchstatus,professionalquestion,pronouncegif,mentalhealth,knowledge_9,mgrwant,newrole,employmentaddl,assessjob3,ainextmuch more integrated,devenvsadmired,overpaid,frameworkworkedwith,what is your involvement in purchasing products or services for the company you work for? (you can choose more than one) (i can recommend or request products),wantworklanguage,rightwrongway,uk_country,experience_range,select up to 3 (appealing message traits: message is personalized),agreedisagree3,jobcontactpriorities4,techendorse_5,codingactivities,how do you use stack overflow? (read other people's questions to solve my problems),programming_ability,toolstechwanttoworkwith,"what is your budget for outside expenditures (hardware, software, consulting, etc) for 2011?",hackathonreasons,toolcountwork,aitoolcurrently using,aiopen,questionsinteresting,sofriction,techoppose_11,so_actions_1,equipmentsatisfiedcpu,experience_midpoint,databaseadmired,aiagentimpactsomewhat disagree,investtimetools,mobiledevelopertype,workpaycare,techendorse_2,influenceworkstation,influencecommunication,ainextsomewhat different,jobsatpoints_15_text,so_actions_6,projectmanagement,educationparents,jobsatpoints_7,age_midpoint,toolcountpersonal,collector,stackoverflowmetachat,age1stcode,stackoverflowcopiedcode,newlearn,important_companymission,how many developers are employed at your company?,stackoverflowconsidermember,offon,entteams,platformhaveworkedwith,select all that apply (source control used: git),which technology products do you own? (you can choose more than one) (iphone),assessjob7,jobcontactpriorities2,opensource,blockchainis,workplan,educationtypes,aiagentchallengessomewhat disagree,stackoverflowwhatdo,ergonomicdevices,adsagreedisagree3,learncodecoursescert,hypotheticaltools2,stackoverflowhasaccount,aiagentknowwrite,importanthiringeducation,visit_frequency,select all that apply (future lang & tech: android),haveworkedlanguage,jobfactors,"in receiving an email about a job opportunity, what attributes of the message would make you more likely to respond? (message is personalized to me)",workloc,containers,ainextmuch less integrated,devenvwantentry,lastnewjob,jobcontactpriorities5,stackoverflowjobsearch,professionalcloud,mgridiot,buildvsbuy,opsysprofessional use,stackoverflowhelpful,resumeprompted,important_variety,majorundergrad,what is your involvement in purchasing products or services for the company you work for? (you can choose more than one) (influencer),aitoolcurrently partially ai,officestackhaveentry,kinshipdevelopers,enjoydebugging,so_actions_15_text,stackoverflownewquestion,hourscomputer,select all that apply (why answer: help a programmer in need),influencedepttech,sovisitfreq,techendorse_9,importantbenefits,techendorse_4,importanthiringcommunication,student,techoppose_15_text,techendorse_6,programhobby,years_coding,education,employment,company_size,salary.1,job_satisfaction,job_title,developer_type,industry,country,programming_experience,database_worked_with,dev_environment,operating_sys,dev_methodology,communication_tools,gender,age,learning,work_experience,remote,team_size,survey_easy,version_control_sys,currency,hobby,race
0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,Unless it's stoopid it gets done,,,,,,,,,,,,,,,,Stack Overflow,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,California,,,,,,,,,Server Programmer,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,Web Platform,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,2011,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,Java,,,,,,,,">$3,000",,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,Influencer,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,iPhone,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,11,,,Start Up (1-25),,So happy it hurts,,,Web Services,United States of America,,,,Linux,,,,30-34,,,,,,,,,
1,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,It's been known to happen,,,,,,,,,,,,,,,,Stack Overflow,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,Web Application Developer,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,Enterprise,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,2011,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,"$40,000 - $60,000",,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,Java,,,,,,,,"$2,001-$3,000",,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,iPhone,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,11,,,Start Up (1-25),,I enjoy going to work,,,Software Products,Other Europe,,,,Mac OS X,,,,40-50,,,,,,,,,
2,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,Once in a blue moon,,,,,,,,,,,,,,,,Stack Overflow,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,Desktop Application Developer,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,Enterprise,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,2011,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,"$20,000 - $40,000",,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,"$501-$1,000",,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,"<$10,000",,,,,,,,,,,,,,,,Influencer,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,iPhone,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,11,,,Mature Small Business (25-100),,So happy it hurts,,,Software Products,South America,,,,Windows 7,,,,30-34,,,,,,,,,
3,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,Not in a million years,,,,,,,,,,,,,,,,Programmers Stack Exchange,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,Desktop Application Developer,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,Other,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,2011,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,"$60,000 - $80,000",,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,">$3,000",,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,41435,,,Mid Sized (100-999),,FML,,,Healthcare,Other Asia,,,,Mac OS X,,,,30-34,,,,,,,,,
4,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,Once in a blue moon,,,,,,,,,,,,,,,,Stack Overflow,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,Student,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,Other,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,2011,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,"<$20,000",,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,Java,,,,,,,,$251-$500,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,41310,,,"Other (not working, consultant, etc.)",,I enjoy going to work,,,Other,Other Europe,,,,Linux,,,,25-29,,,,,,,,,


In [18]:
df_all.to_csv('df_final_population.csv')

In [21]:
df_2011 = df_all[df_all['surveyyear'] == 2011]

df_2011['country'].value_counts()

country
United States of America    195
Other Europe                 92
United Kingdom               70
Canada                       35
Australia                    25
Germany                      20
India                        19
Other Asia                   18
Middle East                  18
France                       14
South America                12
Russia                       12
Netherlands                  11
Italy                         6
Africa                        5
Australasia                   3
Central America               3
Mexico                        3
North America (Other)         2
Name: count, dtype: int64

In [22]:
combined_consolidated_df = df_all.copy()

In [26]:
# Mapping of variants -> canonical names (lowercased keys for matching)
country_map = {
    'united states': 'United States',
    'united states of america': 'United States',
    'united kingdom of great britain and northern ireland': 'United Kingdom',
    'united kingdom': 'United Kingdom',
    'trinidad and tobago': 'Trinidad and Tobago',
    'trinidad & tobago': 'Trinidad and Tobago',
    'syrian arab republic': 'Syria',
    'syria': 'Syria',
    'other country (not listed above)': 'Other',
    'other (please specify)': 'Other',
    'myanmar, {burma}': 'Myanmar',
    'myanmar': 'Myanmar',
    'libyan arab jamahiriya': 'Libya',
    'libya': 'Libya',
    'laos': 'Laos',
    "lao people's democratic republic": 'Laos',
    'korea south': 'South Korea',
    'republic of korea': 'South Korea',
    'south korea': 'South Korea',
    'korea north': 'North Korea',
    'north korea': 'North Korea',
    'ireland': 'Ireland',
    'ireland {republic}': 'Ireland',
    'hong kong (s.a.r.)': 'Hong Kong',
    'hong kong': 'Hong Kong',
    'guinea-bissau': 'Guinea',
    'guinea': 'Guinea',
    'bosnia herzegovina': 'Bosnia and Herzegovina',
    'bosnia and herzegovina': 'Bosnia and Herzegovina',
    'bosnia-herzegovina': 'Bosnia and Herzegovina',
    'vatican city state': 'Vatican',
    'vatican': 'Vatican',
    'viet nam': 'Vietnam',
    'vietnam': 'Vietnam'
}

In [27]:
# Standardize and map 'country' values in combined_consolidated_df
def standardize_country(val):
    # Preserve NaN/None as-is
    if pd.isna(val):
        return val
    # Normalize to lower-case stripped string for lookup
    key = str(val).strip().lower()
    # Return mapped canonical name if available, otherwise return original (preserve original casing)
    return country_map.get(key, val)

# Apply mapping (overwrites 'country' column if present)
if 'country' in combined_consolidated_df.columns:
    combined_consolidated_df['country'] = combined_consolidated_df['country'].apply(standardize_country)
    print("Mapped 'country' values using country_map. Sample counts:")
    # Print the top values (including NaN) to give a quick check
    print(combined_consolidated_df['country'].value_counts(dropna=False).head(20))
else:
    print("Warning: 'country' column not found in combined_consolidated_df")

Mapped 'country' values using country_map. Sample counts:
country
United States         30699
India                 14533
Germany               10309
NaN                   10042
United Kingdom         9382
Canada                 5367
France                 4451
Brazil                 3344
Poland                 3312
Netherlands            3184
Australia              3017
Italy                  2742
Spain                  2682
Russian Federation     2564
Sweden                 2163
Ukraine                2019
Switzerland            1672
Israel                 1467
Austria                1443
Turkey                 1429
Name: count, dtype: int64


In [28]:
# Remove columns from combined_consolidated_df with >70% null values
threshold = 0.7
null_pct = combined_consolidated_df.isna().mean()
cols_to_drop = null_pct[null_pct > threshold].index.tolist()
print(f"Dropping {len(cols_to_drop)} columns with >70% null values:")
print(cols_to_drop)
combined_consolidated_df.drop(columns=cols_to_drop, inplace=True)
print(f"New shape of combined_consolidated_df: {combined_consolidated_df.shape}")

Dropping 639 columns with >70% null values:
['importanthiringrep', 'equipmentsatisfiedram', 'eduother', 'hoursperweek', 'learncodeai', 'jobsatpoints_16', 'webframewantentry', 'assessjobcompensation', 'techoppose_2', 'why_stack_overflow', 'languagedesirenextyear', 'ethicalimplications', 'knowledge_6', "aitooldon't plan to use ai for this task", 'purchasehow', 'dogs_vs_cats', 'agreedisagree2', 'equipmentsatisfiedmonitors', "please rate the advertising you've seen on stack overflow (the ads are relevant)", 'fizzbuzz', 'influenceviztools', 'stackoverflowmoderation', 'haveworkedplatform', 'interview_likelihood', 'equipmentsatisfiedrw', 'sovisit1st', 'job_discovery', 'what types of purchases are you involved in? (hardware)', 'databasedesirenextyear', 'workchallenge', 'icorpm', 'how likely is it that a recommendation you make will be acted upon?', 'webframeadmired', 'assessjobdiversity', 'knowledge_8', 'adspriorities2', 'aiagentchallengesneutral', 'aiselect', 'problemsolving', 'frustration', 

In [29]:
# Detailed column stats sorted by non-null count (descending)
col_stats = pd.DataFrame({
    'non_null_count': combined_consolidated_df.notna().sum(),
    'null_count': combined_consolidated_df.isna().sum(),
    'unique_count': combined_consolidated_df.nunique(dropna=True)
}).sort_values('non_null_count', ascending=False)


In [30]:
# Add percent of total rows for nulls (rounded to 2 decimals)
total_rows = combined_consolidated_df.shape[0]
col_stats['null_pct'] = (col_stats['null_count'] / total_rows * 100).round(2)

In [31]:
col_stats

Unnamed: 0,non_null_count,null_count,unique_count,null_pct
surveyyear,154521,0,15,0.0
country,144479,10042,229,6.5
employment,140846,13675,151,8.85
education,137278,17243,554,11.16
years_coding,128780,25741,194,16.66
age,125492,29029,175,18.79
programming_experience,116643,37878,164,24.51
developer_type,113405,41116,14745,26.61
company_size,108436,46085,38,29.82
survey_easy,107361,47160,18,30.52


In [32]:
df_final = combined_consolidated_df.copy()

In [33]:
df_final.to_csv('df_final.csv') 