# Data Wrangling Template

## Gather

In [2]:
import zipfile
import pandas as pd

In [3]:
# Unpack the downloaded archive; extractall() with no target path writes
# the members into the current working directory
with zipfile.ZipFile(file='armenian-online-job-postings.zip', mode='r') as myzip:
    myzip.extractall(path=None)

In [4]:
# Load the extracted comma-separated file into a DataFrame
df = pd.read_csv('online-job-postings.csv', sep=',')

## Assess

In [None]:
# Show the DataFrame itself for a visual assessment
df

In [5]:
# Display a basic summary of the DataFrame using .info
# (the output lists each column with its non-null count and dtype)
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 19001 entries, 0 to 19000
Data columns (total 24 columns):
jobpost             19001 non-null object
date                19001 non-null object
Title               18973 non-null object
Company             18994 non-null object
AnnouncementCode    1208 non-null object
Term                7676 non-null object
Eligibility         4930 non-null object
Audience            640 non-null object
StartDate           9675 non-null object
Duration            10798 non-null object
Location            18969 non-null object
JobDescription      15110 non-null object
JobRequirment       16479 non-null object
RequiredQual        18517 non-null object
Salary              9623 non-null object
ApplicationP        18941 non-null object
OpeningDate         18295 non-null object
Deadline            18936 non-null object
Notes               2211 non-null object
AboutC              12470 non-null object
Attach              1559 non-null object
Year              

In [12]:
# Preview the first five rows of the DataFrame
df.head(n=5)


2012    2149
2015    2009
2013    2009
2014    1983
2008    1785
2011    1697
2007    1538
2010    1511
2009    1191
2005    1138
2006    1116
2004     875
Name: Year, dtype: int64

In [None]:
# Count how many rows fall under each distinct value of the Year column
df.Year.value_counts()

1) Missing Values (NaN)

2) StartDate inconsistencies: many different spellings and phrasings of "as soon as possible" (ASAP)

3) Non-descriptive or misspelled column names (ApplicationP, AboutC, RequiredQual, JobRequirment, etc.)

## Clean

#### Define
- Select all records in the StartDate column that contain "As soon as possible", "Immediately", etc., and replace the text in those cells with "ASAP"

- Select all non-descriptive and misspelled column headers (ApplicationP, RequiredQual, AboutC, JobRequirment) and replace them with full words (ApplicationProcedure, RequiredQualification, AboutCompany, JobRequirement)

#### Code

In [13]:
# Work on a deep copy so the cleaning steps leave the original df untouched
df_clean = df.copy(deep=True)

- Select all non-descriptive and misspelled column headers (ApplicationP, RequiredQual, AboutC, JobRequirment) and replace them with full words (ApplicationProcedure, RequiredQualification, AboutCompany, JobRequirement)

In [15]:
# Replace the abbreviated / misspelled headers with full words.
# 'JobRequirment' becomes 'JobRequirement' (singular), the name given in the
# Define step; the previous mapping produced 'JobRequirements' instead.
df_clean = df_clean.rename(columns={
    'ApplicationP': 'ApplicationProcedure',
    'RequiredQual': 'RequiredQualification',
    'AboutC': 'AboutCompany',
    'JobRequirment': 'JobRequirement',
})

- Select all records in the StartDate column that contain "As soon as possible", "Immediately", etc., and replace the text in those cells with "ASAP"

In [20]:
# Every StartDate spelling that should be normalised to the single token "ASAP".
asap_list = ['Immediately', 'As soon as possible', 'Upon hiring',
             'Immediate', 'Immediate employment', 'As soon as possible.', 'Immediate job opportunity',
             '"Immediate employment, after passing the interview."',
             'ASAP preferred', 'Employment contract signature date',
             'Immediate employment opportunity', 'Immidiately', 'ASA',
             'Asap', '"The position is open immediately but has a flexible start date depending on the candidates earliest availability."',
             'Immediately upon agreement', '20 November 2014 or ASAP',
             'immediately', 'Immediatelly',
             '"Immediately upon selection or no later than November 15, 2009."',
             'Immediate job opening', 'Immediate hiring', 'Upon selection',
             'As soon as practical', 'Immadiate', 'As soon as posible',
             'Immediately with 2 months probation period',
             '12 November 2012 or ASAP', 'Immediate employment after passing the interview',
             'Immediately/ upon agreement', '01 September 2014 or ASAP',
             'Immediately or as per agreement', 'as soon as possible',
             'As soon as Possible', 'in the nearest future', 'immediate',
             '01 April 2014 or ASAP', 'Immidiatly', 'Urgent',
             'Immediate or earliest possible', 'Immediate hire',
             'Earliest  possible', 'ASAP with 3 months probation period.',
             'Immediate employment opportunity.', 'Immediate employment.',
             'Immidietly', 'Imminent', 'September 2014 or ASAP', 'Imediately']

# Some entries above are wrapped in literal double quotes -- presumably copied
# from the raw CSV text, where such fields are quoted (TODO confirm against the
# data). Also match the same text without the wrapping quotes so those rows are
# not silently skipped. asap_list itself is left unchanged.
asap_variants = asap_list + [phrase.strip('"') for phrase in asap_list
                             if phrase.startswith('"')]

# Replace all variants in one exact-match pass and assign the result back to
# the column. The previous per-phrase .replace(..., inplace=True) on the
# df_clean.StartDate accessor is chained in-place modification, which newer
# pandas versions warn about and which may not propagate to df_clean.
df_clean['StartDate'] = df_clean['StartDate'].replace(asap_variants, 'ASAP')
    

#### Test

In [21]:
# Re-run .info on the cleaned copy to check the renamed column headers
df_clean.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 19001 entries, 0 to 19000
Data columns (total 24 columns):
jobpost                  19001 non-null object
date                     19001 non-null object
Title                    18973 non-null object
Company                  18994 non-null object
AnnouncementCode         1208 non-null object
Term                     7676 non-null object
Eligibility              4930 non-null object
Audience                 640 non-null object
StartDate                9675 non-null object
Duration                 10798 non-null object
Location                 18969 non-null object
JobDescription           15110 non-null object
JobRequirement           16479 non-null object
RequiredQualification    18517 non-null object
Salary                   9623 non-null object
ApplicationProcedure     18941 non-null object
OpeningDate              18295 non-null object
Deadline                 18936 non-null object
Notes                    2211 non-null object
AboutC

In [23]:
# Tally the StartDate values of the cleaned copy to inspect the ASAP count
df_clean['StartDate'].value_counts()

ASAP                                                                                                                        6856
01 September 2012                                                                                                             31
March 2006                                                                                                                    27
November 2006                                                                                                                 22
January 2010                                                                                                                  19
February 2014                                                                                                                 17
01 February 2005                                                                                                              17
TBD                                                                                              

In [24]:
# Test: no phrase from asap_list may still appear as a StartDate value
leftover = df_clean['StartDate'].isin(asap_list)
assert not leftover.any()