In [1]:
#constants, paths
DATA_PATH='../data/raw/survey_results_public.csv'
EXPORT_PATH='../data/processed/1_preprocessed_df.pkl'

REPLACE_DICT = {
    'YearsCodePro': {'Less than 1 year': 0, 'More than 50 years': 51}, 
    'YearsCode':    {'Less than 1 year': 0, 'More than 50 years': 51}}

In [2]:
# Load packages
import numpy as np
import pandas as pd

In [3]:
#read data
raw_df = pd.read_csv(DATA_PATH)

----------------

# Functions

In [4]:
def Split_answer(column, delimiter=';'):
    """ 
    Split multiple answers in a single string 
    to a list of single strings each represnting a single answers 

    Parameters:
    
    * column : column name that has string series with answers 
    * delimiter (string): Another decimal integer 
                          Defaults to ";"

    """
    # 1. check if the column contains delimeters
    if (df[column].str.contains(';').any()):
        #2. split  if it contain multiple answers
        df[column]=df[column].str.split(';')
        
        #Handle null values: Replace NAs with empty lists 
        nulls=df[column].isnull()
        df[column].loc[nulls]=df[column].loc[nulls].apply(lambda x:[])
        
        print(df[column]) 
       

# Preprocess

In [5]:
df=raw_df.copy()

## 1. Handle  irrelavant data types

In [6]:
for column, replacment in REPLACE_DICT.items():
    df[column]= df[column].replace(replacment).astype(np.float32)
    print('-----------------------------------------------------------------------')
    print(column,'\n')
    print(df[column].unique())

-----------------------------------------------------------------------
YearsCodePro 

[nan 10.  4.  5.  6.  2. 30.  9. 18. 12. 21.  1. 16.  0. 15.  3. 35.  7.
  8. 17. 14. 26. 25. 20. 50. 34. 11. 24. 22. 13. 31. 23. 39. 41. 27. 28.
 19. 33. 51. 37. 29. 32. 43. 40. 38. 45. 42. 46. 36. 44. 47. 48. 49.]
-----------------------------------------------------------------------
YearsCode 

[nan  7. 17.  3.  4.  6. 16. 12. 15. 10. 40.  9. 26. 14. 39. 20.  8. 19.
  5.  0. 22.  2.  1. 34. 21. 13. 25. 24. 30. 31. 18. 38. 51. 27. 41. 42.
 35. 23. 28. 11. 37. 44. 43. 36. 33. 45. 29. 50. 46. 32. 47. 49. 48.]


## 2. Split Answers

In [7]:
for column in df.columns:
    # first check the type of the column
    if df[column].dtype == 'O':
        Split_answer(column, delimiter=';')
        


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._setitem_single_block(indexer, value, name)


0        [Coding Bootcamp, Other online resources (ex: ...
1        [Other online resources (ex: videos, blogs, et...
2        [Other online resources (ex: videos, blogs, et...
3                                                       []
4                                [Friend or family member]
                               ...                        
83434    [Other online resources (ex: videos, blogs, et...
83435    [Other online resources (ex: videos, blogs, et...
83436                                             [School]
83437    [Online Courses or Certification, Books / Phys...
83438                                             [School]
Name: LearnCode, Length: 83439, dtype: object
0                                      [Developer, mobile]
1                                                       []
2                                                       []
3                                   [Developer, front-end]
4        [Developer, desktop or enterprise applications...
          

0                                        [PHPStorm, Xcode]
1           [Android Studio, IntelliJ, Notepad++, PyCharm]
2        [IPython/Jupyter, PyCharm, RStudio, Sublime Te...
3                                                       []
4         [Atom, IPython/Jupyter, Notepad++, PyCharm, Vim]
                               ...                        
83434    [IntelliJ, Sublime Text, Vim, Visual Studio Code]
83435    [Android Studio, Eclipse, Emacs, IntelliJ, Net...
83436    [Android Studio, Eclipse, IntelliJ, IPython/Ju...
83437                              [PyCharm, Sublime Text]
83438                       [IntelliJ, Visual Studio Code]
Name: NEWCollabToolsHaveWorkedWith, Length: 83439, dtype: object
0                                            [Atom, Xcode]
1                                                       []
2        [IPython/Jupyter, RStudio, Sublime Text, Visua...
3                                                       []
4        [Atom, IPython/Jupyter, Notepad++, PyChar

In [8]:
# another way to know object column
object_cols = df.select_dtypes(include='object').columns.tolist()
# object_cols

----------------

# Visually verify results

In [9]:
#check sample
df.sample(1).iloc[0]

ResponseId                                                                  41337
MainBranch                                 I am a student who is learning to code
Employment                                     Not employed, but looking for work
Country                                                                 Sri Lanka
US_State                                                                      NaN
UK_Country                                                                    NaN
EdLevel                         Secondary school (e.g. American high school, G...
Age1stCode                                                          11 - 17 years
LearnCode                       [Other online resources (ex: videos, blogs, et...
YearsCode                                                                     6.0
YearsCodePro                                                                  NaN
DevType                                          [Developer, full-stack, Student]
OrgSize         

In [10]:
i=df.sample(1).index[0]
print(raw_df['LanguageHaveWorkedWith'].iloc[i])
print(df['LanguageHaveWorkedWith'].iloc[i])

Assembly;Bash/Shell;C;C++;HTML/CSS;PowerShell;SQL;VBA
['Assembly', 'Bash/Shell', 'C', 'C++', 'HTML/CSS', 'PowerShell', 'SQL', 'VBA']


In [11]:
print(raw_df['DevType'].iloc[i])
print(df['DevType'].iloc[i])

Developer, front-end;Developer, back-end;Data or business analyst
['Developer, front-end', 'Developer, back-end', 'Data or business analyst']


In [12]:
print(raw_df['YearsCodePro'].iloc[i])
print(df['YearsCodePro'].iloc[i])

23
23.0


---

# Export Data

In [14]:
df.to_pickle(EXPORT_PATH)

----------------