In [175]:
import pandas as pd
import numpy as np

## for language detection
import langdetect 

### Import Spanish job postings

In [176]:
# Load the scraped job postings for Spain: one CSV per source site
# (Glassdoor, Indeed, LinkedIn), read in that order.
sp_glass_df, sp_indeed_df, sp_link_df = (
    pd.read_csv(csv_path)
    for csv_path in (
        r'./Glassdoor_WebS/Glassdoor_data_info_sp.csv',
        r'./Indeed_WebS/Indeed_data_info_sp.csv',
        r'./LinkedIn_WebS/Linkedin_data_info_sp.csv',
    )
)

In [177]:
# Inspect the column labels of the Glassdoor frame before cleaning it.
sp_glass_df.columns

Index(['Unnamed: 0', 'Job Title', 'Company Name', 'Location', 'Date Scraped',
       'Job URL', 'Job Industry', 'Job Type', 'Job Seniority',
       'Job Description'],
      dtype='object')

In [178]:
# Preview the first two Glassdoor rows as loaded from the CSV.
sp_glass_df.head(2)

Unnamed: 0.1,Unnamed: 0,Job Title,Company Name,Location,Date Scraped,Job URL,Job Industry,Job Type,Job Seniority,Job Description
0,1,Business Analyst (m/f/d),TUI,Arca,7/26/2021,https://www.glassdoor.com/partner/jobListing.h...,Industry : Travel & Tourism,Job Type : N/A,,For TUI Group IT we are looking for a Business...
1,2,Business Analyst | Delta Partners,"FTI Consulting, Inc.",Spain,7/26/2021,https://www.glassdoor.com/partner/jobListing.h...,Industry : Business Services,Job Type : Internship,,Brief Description\n\nThis job is for you if yo...


In [179]:
# Remove the 'Unnamed: 0' column (presumably an index saved with the CSV --
# confirm against the scraper) and discard rows in which every field is missing.
# Re-assigning instead of mutating in place, together with errors='ignore',
# keeps this cell re-runnable: a second execution no longer raises KeyError
# because the column is already gone.
sp_glass_df = (
    sp_glass_df
    .drop(columns='Unnamed: 0', errors='ignore')
    .dropna(how='all')
)

In [180]:
# Preview the Glassdoor frame again after dropping the extra column.
sp_glass_df.head(2)

Unnamed: 0,Job Title,Company Name,Location,Date Scraped,Job URL,Job Industry,Job Type,Job Seniority,Job Description
0,Business Analyst (m/f/d),TUI,Arca,7/26/2021,https://www.glassdoor.com/partner/jobListing.h...,Industry : Travel & Tourism,Job Type : N/A,,For TUI Group IT we are looking for a Business...
1,Business Analyst | Delta Partners,"FTI Consulting, Inc.",Spain,7/26/2021,https://www.glassdoor.com/partner/jobListing.h...,Industry : Business Services,Job Type : Internship,,Brief Description\n\nThis job is for you if yo...


In [181]:
# Inspect the column labels of the Indeed frame before cleaning it.
sp_indeed_df.columns

Index(['row', 'Job Title', 'Company Name', 'Location', 'Date Scraped',
       'Job URL', 'Job Industry', 'Job Type', 'Job Seniority',
       'Job Description'],
      dtype='object')

In [182]:
# Remove the 'row' column so the Indeed frame has the same columns as the
# other sources. errors='ignore' plus re-assignment makes the cell safe to
# re-run (the in-place version raised KeyError on a second execution).
sp_indeed_df = sp_indeed_df.drop(columns='row', errors='ignore')

In [183]:
# Preview the first two Indeed rows after dropping the 'row' column.
sp_indeed_df.head(2)

Unnamed: 0,Job Title,Company Name,Location,Date Scraped,Job URL,Job Industry,Job Type,Job Seniority,Job Description
0,nueva oferta\nBusiness Analyst (m/f/d),TUI,"Arca, A Coruña provincia",7/26/2021,https://es.indeed.com/rc/clk?jk=4510d66ff04f01...,,,,For TUI Group IT we are looking for a Business...
1,nueva oferta\nAnalytics Analyst With English,Triangle,"Madrid, Madrid provincia",7/26/2021,https://es.indeed.com/rc/clk?jk=44e95198c04487...,,,,Role:\nAnalytics Analyst with English\nDescrip...


In [184]:
# Inspect the column labels of the LinkedIn frame.
sp_link_df.columns

Index(['Job Title', 'Company Name', 'Location', 'Date Scraped', 'Job URL',
       'Job Industry', 'Job Type', 'Job Seniority', 'Job Description'],
      dtype='object')

In [185]:
# Preview the first two LinkedIn rows.
sp_link_df.head(2)

Unnamed: 0,Job Title,Company Name,Location,Date Scraped,Job URL,Job Industry,Job Type,Job Seniority,Job Description
0,Programador/a Robótica (ESTABLE). Inglés.,Keyland Sistemas de Gestión,"Valladolid, Castilla and Leon, Spain",2021-07-26,https://es.linkedin.com/jobs/view/programador-...,Computer Software,Employment type\nFull-time,Mid-Senior level,"En KEYLAND nos dedicamos a la optimización, de..."
1,Business Intelligence Analyst,Carnovo,"Barcelona, Catalonia, Spain",2021-07-26,https://es.linkedin.com/jobs/view/business-int...,,,,At Carnovo we are continuing to revolutionize ...


## Let the EDA begin

How will we handle it? We have to make a few decisions:
1. Will we find the most required skills for all of the countries altogether? Or will we address each separately?
2. Will we use any of the columns to group our information or analyze the dataset as a whole?
3. How will we work on the long text data? 

1. Since I will decide beforehand which country I'm moving to, I will address each country's required skills separately.
2. We will be looking for the skills required for each position. It's likely that we will have to standardize the job titles, as different companies give the same position different names.
3. As we dive deeper into the data, we will be exploring different options.

### Spanish DFs

First, we will concatenate all the dfs to analyze the country data as a whole

In [186]:
# Stack the three per-site frames (Glassdoor, Indeed, LinkedIn) on top of
# each other. pd.concat keeps each frame's own row labels by default.
df_sp_concat = pd.concat(
    objs=[sp_glass_df, sp_indeed_df, sp_link_df],
)

In [187]:
# Preview the first two rows of the combined frame.
df_sp_concat.head(2)

Unnamed: 0,Job Title,Company Name,Location,Date Scraped,Job URL,Job Industry,Job Type,Job Seniority,Job Description
0,Business Analyst (m/f/d),TUI,Arca,7/26/2021,https://www.glassdoor.com/partner/jobListing.h...,Industry : Travel & Tourism,Job Type : N/A,,For TUI Group IT we are looking for a Business...
1,Business Analyst | Delta Partners,"FTI Consulting, Inc.",Spain,7/26/2021,https://www.glassdoor.com/partner/jobListing.h...,Industry : Business Services,Job Type : Internship,,Brief Description\n\nThis job is for you if yo...


In [188]:
# Number of Glassdoor rows going into the combined frame.
len(sp_glass_df)

182

In [189]:
# Number of Indeed rows going into the combined frame.
len(sp_indeed_df)

96

In [190]:
# Number of LinkedIn rows going into the combined frame.
len(sp_link_df)

1529

In [191]:
# Row count of the combined frame; it should equal the sum of the three sources.
len(df_sp_concat)

1807

To simplify our analysis, we will reset the index

In [192]:
# Replace the row labels inherited from the source frames with a fresh
# 0..n-1 index; the old labels are discarded rather than kept as a column.
df_sp_concat = df_sp_concat.reset_index(drop=True)

### Language detection

Just to make sure we have reliable information, we will be using a language detection tool

In [193]:
# Sanity check: run the detector on the first job description only and
# print the first 40 characters next to the detected language code.
txt = df_sp_concat["Job Description"].iloc[0]
print(f"{txt[:40]}  -->  {langdetect.detect(txt)}")

For TUI Group IT we are looking for a Bu  -->  en


In [194]:
# langdetect's algorithm is randomised; fixing the seed makes the detected
# languages (and every row count derived from them) reproducible across runs.
langdetect.DetectorFactory.seed = 0


def _detect_lang(text):
    """Return the language code detected for ``text``.

    Returns an empty string when ``text`` is blank or when langdetect cannot
    extract any usable features from it.
    """
    text = str(text)
    if text.strip() == "":
        return ""
    try:
        return langdetect.detect(text)
    except langdetect.LangDetectException:
        # Raised for text with no detectable features (e.g. only digits or symbols).
        return ""


# Only the description column is needed, so fill missing values there
# instead of filling the whole frame.
df_sp_concat['lang'] = df_sp_concat['Job Description'].fillna('').apply(_detect_lang)
df_sp_concat.head()

Unnamed: 0,Job Title,Company Name,Location,Date Scraped,Job URL,Job Industry,Job Type,Job Seniority,Job Description,lang
0,Business Analyst (m/f/d),TUI,Arca,7/26/2021,https://www.glassdoor.com/partner/jobListing.h...,Industry : Travel & Tourism,Job Type : N/A,,For TUI Group IT we are looking for a Business...,en
1,Business Analyst | Delta Partners,"FTI Consulting, Inc.",Spain,7/26/2021,https://www.glassdoor.com/partner/jobListing.h...,Industry : Business Services,Job Type : Internship,,Brief Description\n\nThis job is for you if yo...,en
2,Analytics Analyst With English,Triangle,Madrid,7/26/2021,https://www.glassdoor.com/partner/jobListing.h...,Industry : Business Services,Job Type : Full-time,,Role:\nAnalytics Analyst with English\nDescrip...,en
3,Senior Payroll Analyst - Blue-collars (They/Sh...,Glovo,Spain,7/26/2021,https://www.glassdoor.com/partner/jobListing.h...,Industry : Information Technology,Job Type : N/A,,About Glovo:\nWe're a Barcelona-based startup ...,en
4,Marketing Data Analyst (English),Blu Selection,Spain,7/26/2021,https://www.glassdoor.com/partner/jobListing.h...,Industry : N/A,Job Type : Full-time,,Do you dedicate your career to providing the b...,en


In [195]:
# Number of postings per detected language ('' marks rows with no description text).
df_sp_concat['lang'].value_counts()

en    1300
es     433
        58
pt       6
ca       5
fr       2
id       1
hu       1
de       1
Name: lang, dtype: int64

#### Let's take a look at the non-english/non-spanish results

In [196]:
# Read the description of row label 38, restricted to the postings whose
# detected language is neither English nor Spanish.
df_sp_concat.loc[~df_sp_concat['lang'].isin(['en', 'es']), 'Job Description'].loc[38]

'Data analyst en Barcelona.\nQuina s la missi principal?\n\nDur a terme el desenvolupament de la plataforma Intel litur tant a nivell tecnolgic com de crrega de continguts a partir del que expressa el Pla de recerca de lACT i el Roadmap especfic de desplegament de la plataforma tecnolgic i el seu encaix dins lecosistema digital de lACT.\n\nEl desplegament correcte de la plataforma ha de permetre una transferncia de coneixement ptima per al sector turstic aix com afavorir la transformaci cap a la cultura de la dada del personal de la prpia ACT.\n\n"PROJECTE INTEL LITUR" dins del PLA DE MRQUETING 2018-2022\n\nFuncions principals:\nSeguiment del desplegament de la plataforma de dades amb lempresa tecnolgica desenvolupadora.\nVetllar per lencaix de la plataforma a lextranet i al CRM de lACT.\nPlantejar els dashboards de sortida de dades de dades massives dins de la plataforma aix com la llibreria virtual i lespai de transferncia de continguts.\nCoordinar amb la tcnica de continguts el pla 

The results where lang is empty are the ones where the job description field is empty, which makes sense. We should consider either completing those fields or dropping the rows, as we won't be able to extract the most sought-after skills from them. The same applies to job descriptions in languages other than English or Spanish.

In [197]:
# Keep only the postings detected as English or Spanish; blank and
# other-language descriptions are discarded.
df_sp_concat = df_sp_concat[df_sp_concat['lang'].isin(['en', 'es'])]

In [198]:
# Rows remaining after keeping only English and Spanish postings.
len(df_sp_concat)

1733

#### Checking the job title distribution

In [199]:
# Share (in %) of all postings held by each exact job title, top ten only.
(df_sp_concat['Job Title'].value_counts() / len(df_sp_concat) * 100).round(2).iloc[:10]

Data Engineer                                              2.54
Software Engineer                                          2.19
Business Analyst                                           2.02
Data Analyst                                               1.85
Data Scientist – Remote, Full-time                         1.73
[REMOTE] Swift Developer – Full-time                       1.50
Data Scientist                                             1.44
ET_BO_12961_Process Business Analyst Lead                  0.98
Senior Analyst - Consulting                                0.98
Software Engineer, Trilogy (Remote) - $200,000/year USD    0.98
Name: Job Title, dtype: float64

The prevalent job title is Data Engineer in this case.

In [200]:
# Postings whose title contains 'Business' (case-sensitive).
# na=False treats a missing title as "no match".
df_business_sp = df_sp_concat[df_sp_concat['Job Title'].str.contains('Business', na=False)]

In [201]:
# Postings whose title contains 'Analyst'; missing titles never match.
df_analyst_sp = df_sp_concat[df_sp_concat['Job Title'].str.contains('Analyst', na=False)]

In [202]:
# Postings whose title contains 'Analista' (Spanish spelling); missing titles never match.
df_analista_sp = df_sp_concat[df_sp_concat['Job Title'].str.contains('Analista', na=False)]

In [203]:
# Postings whose title contains 'Analytics'; missing titles never match.
df_analytics_sp = df_sp_concat[df_sp_concat['Job Title'].str.contains('Analytics', na=False)]

In [204]:
# Postings whose title contains 'Tableau'; missing titles never match.
df_tableau_sp = df_sp_concat[df_sp_concat['Job Title'].str.contains('Tableau', na=False)]

In [205]:
# Postings whose title contains 'Business Intelligence'. Every such title
# also contains 'Business', so these rows are a subset of that match.
df_business_i_sp = df_sp_concat[df_sp_concat['Job Title'].str.contains('Business Intelligence', na=False)]

In [206]:
# Postings whose title contains the upper-case substring 'BI'. This is a
# plain substring match, so 'BI' inside a longer upper-case token matches too.
df_bi_sp = df_sp_concat[df_sp_concat['Job Title'].str.contains('BI', na=False)]

In [207]:
# Postings whose title contains 'Risk'; missing titles never match.
df_risk_sp = df_sp_concat[df_sp_concat['Job Title'].str.contains('Risk', na=False)]

In [208]:
# Postings whose title contains 'Riesgo' (Spanish for risk); missing titles never match.
df_riesgo_sp = df_sp_concat[df_sp_concat['Job Title'].str.contains('Riesgo', na=False)]

In [209]:
# Postings whose title contains 'Customer'; missing titles never match.
df_customer_sp = df_sp_concat[df_sp_concat['Job Title'].str.contains('Customer', na=False)]

In [210]:
# Postings whose title contains 'Cliente' (Spanish for customer); missing titles never match.
df_cliente_sp = df_sp_concat[df_sp_concat['Job Title'].str.contains('Cliente', na=False)]

In [211]:
# Postings whose title contains 'Product'; missing titles never match.
df_product_sp = df_sp_concat[df_sp_concat['Job Title'].str.contains('Product', na=False)]

In [212]:
# Postings whose title contains 'Producto'. Every such title also contains
# 'Product', so these rows are a subset of that match.
df_producto_sp = df_sp_concat[df_sp_concat['Job Title'].str.contains('Producto', na=False)]

In [213]:
# Postings whose title contains 'Insights'; missing titles never match.
df_insights_sp = df_sp_concat[df_sp_concat['Job Title'].str.contains('Insights', na=False)]

In [214]:
# Stack every keyword-filtered subset of postings into one frame.
# A posting whose title matched several keywords appears once per match here.
# Fix: the original referenced df_data_analytics_sp and df_tablesp_sp, which
# are never defined (NameError on a fresh kernel); the frames built by the
# filter cells are df_analytics_sp and df_tableau_sp.
df_full_concat_sp = pd.concat([
    df_business_sp,
    df_analyst_sp,
    df_analista_sp,
    df_analytics_sp,
    df_tableau_sp,
    df_business_i_sp,
    df_bi_sp,
    df_risk_sp,
    df_riesgo_sp,
    df_customer_sp,
    df_cliente_sp,
    df_product_sp,
    df_producto_sp,
    df_insights_sp,
])

In [215]:
# Collapse rows that are identical in every column, i.e. postings that were
# selected by more than one title keyword.
# NOTE(review): the same posting scraped on different dates differs in
# 'Date Scraped' and therefore survives this step -- confirm that is intended.
df_full_concat_sp = df_full_concat_sp.drop_duplicates()

In [216]:
# Number of distinct postings that matched at least one title keyword.
len(df_full_concat_sp)

839

We kept only the rows that meet at least one of the conditions above, which is almost half of the dataset.

In [217]:
# Preview the keyword-filtered postings; row labels are those of df_sp_concat.
df_full_concat_sp.head()

Unnamed: 0,Job Title,Company Name,Location,Date Scraped,Job URL,Job Industry,Job Type,Job Seniority,Job Description,lang
0,Business Analyst (m/f/d),TUI,Arca,7/26/2021,https://www.glassdoor.com/partner/jobListing.h...,Industry : Travel & Tourism,Job Type : N/A,,For TUI Group IT we are looking for a Business...,en
1,Business Analyst | Delta Partners,"FTI Consulting, Inc.",Spain,7/26/2021,https://www.glassdoor.com/partner/jobListing.h...,Industry : Business Services,Job Type : Internship,,Brief Description\n\nThis job is for you if yo...,en
8,"Business Analyst, Master Data Management",IQVIA,Madrid,7/27/2021,https://www.glassdoor.com/partner/jobListing.h...,Industry : Biotech & Pharmaceuticals,Job Type : Full-time,,External Job Description\nRole Overview\nThe I...,en
22,"Business Analyst, Master Data Management",IQVIA,Madrid,2021-07-28,https://www.glassdoor.com/partner/jobListing.h...,Industry : Biotech & Pharmaceuticals,Job Type : Full-time,,External Job Description\nRole Overview\nThe I...,en
42,Business Analyst - Zimit,Alight,Granada,2021-07-29,https://www.glassdoor.com/partner/jobListing.h...,Industry : Business Services,Job Type : Full-time,,"Other Permanent Granada (GRA1), Spain Posted o...",en


### Text analysis

As we won't be modelling, we will be looking for some commonly asked for skills and we will measure how much they are mentioned in different job postings. For this, we will be using the str.contains method as used above. 

##### Skills that we will be looking for
Below we list the skills that we will be looking for in the descriptions, in order to understand which ones are requested the most:
- Data Visualization (data viz - Visualizing Data - Reporting)
- Power BI (PowerBI)
- Tableau
- Looker
- Qlik
- Google Data Studio (GDS)
- Data Cleaning (Data prep - Data preparation)
- Programming
- MATLAB
- R
- Python
- SAS
- SQL 
- NoSQL
- Machine Learning (ML)
- Microsoft Excel
- ETL
- Azure
- AWS
- Google Cloud Platform (GCP)
- Oracle
- Domain (industry)

#### Build the list of English search terms

In [218]:
# English search terms, all lower-case because the descriptions are
# lower-cased before matching. ' r ' and ' ml ' are padded with spaces so
# that they are only matched as stand-alone words.
skills_list_en = [
    # visualization / reporting
    'data visualization', 'data viz', 'visualizing data', 'reporting',
    'power bi', 'powerbi', 'tableau', 'looker', 'qlik',
    'google data studio', 'gds',
    # data preparation
    'data cleaning', 'data prep', 'data preparation',
    # programming / analysis
    'programming', 'matlab', ' r ', 'python', 'sas', 'sql', 'nosql',
    'machine learning', 'artificial', 'intelligence', ' ml ',
    'microsoft excel', 'excel', 'etl',
    # cloud / vendors
    'azure', 'aws', 'google cloud platform', 'gcp', 'google', 'oracle',
    # domain knowledge
    'domain', 'industry',
    # data engineering
    'apache', 'spark', 'dataproc', 'databricks', 'airflow', 'dbt',
    'pipeline', 'scalding', 'hadoop', 'pig',
]

#### Build the list of Spanish search terms

In [219]:
# Spanish search terms, all lower-case. Tool and vendor names are the same
# as in the English list; only the generic words are translated.
skills_list_sp = [
    # visualization / reporting
    'visualización', 'data viz', 'visualizar', 'reporteria', 'reportes',
    'power bi', 'powerbi', 'tableau', 'looker', 'qlik',
    'google data studio', 'gds',
    # data preparation
    'limpieza', 'data prep', 'preparacion', 'preparación',
    # programming / analysis
    'programming', 'matlab', ' r ', 'python', 'sas', 'sql', 'nosql',
    'machine learning', 'inteligencia', ' ml ',
    'microsoft excel', 'excel', 'etl',
    # cloud / vendors
    'azure', 'aws', 'google cloud platform', 'gcp', 'google', 'oracle',
    # domain knowledge
    'dominio', 'industria',
    # data engineering
    'apache', 'spark', 'dataproc', 'databricks', 'airflow', 'dbt',
    'pipeline', 'scalding', 'hadoop', 'pig',
]

#### Concatenate the two lists

In [220]:
# English terms first, Spanish terms second; terms present in both lists
# therefore appear twice.
skills_list_sp_en = [*skills_list_en, *skills_list_sp]

In [221]:
# Display the combined English + Spanish term list.
skills_list_sp_en

['data visualization',
 'data viz',
 'visualizing data',
 'reporting',
 'power bi',
 'powerbi',
 'tableau',
 'looker',
 'qlik',
 'google data studio',
 'gds',
 'data cleaning',
 'data prep',
 'data preparation',
 'programming',
 'matlab',
 ' r ',
 'python',
 'sas',
 'sql',
 'nosql',
 'machine learning',
 'artificial',
 'intelligence',
 ' ml ',
 'microsoft excel',
 'excel',
 'etl',
 'azure',
 'aws',
 'google cloud platform',
 'gcp',
 'google',
 'oracle',
 'domain',
 'industry',
 'apache',
 'spark',
 'dataproc',
 'databricks',
 'airflow',
 'dbt',
 'pipeline',
 'scalding',
 'hadoop',
 'pig',
 'visualización',
 'data viz',
 'visualizar',
 'reporteria',
 'reportes',
 'power bi',
 'powerbi',
 'tableau',
 'looker',
 'qlik',
 'google data studio',
 'gds',
 'limpieza',
 'data prep',
 'preparacion',
 'preparación',
 'programming',
 'matlab',
 ' r ',
 'python',
 'sas',
 'sql',
 'nosql',
 'machine learning',
 'inteligencia',
 ' ml ',
 'microsoft excel',
 'excel',
 'etl',
 'azure',
 'aws',
 'google

In [222]:
# Reminder of the frame whose 'Job Description' column will be searched.
df_full_concat_sp.head()

Unnamed: 0,Job Title,Company Name,Location,Date Scraped,Job URL,Job Industry,Job Type,Job Seniority,Job Description,lang
0,Business Analyst (m/f/d),TUI,Arca,7/26/2021,https://www.glassdoor.com/partner/jobListing.h...,Industry : Travel & Tourism,Job Type : N/A,,For TUI Group IT we are looking for a Business...,en
1,Business Analyst | Delta Partners,"FTI Consulting, Inc.",Spain,7/26/2021,https://www.glassdoor.com/partner/jobListing.h...,Industry : Business Services,Job Type : Internship,,Brief Description\n\nThis job is for you if yo...,en
8,"Business Analyst, Master Data Management",IQVIA,Madrid,7/27/2021,https://www.glassdoor.com/partner/jobListing.h...,Industry : Biotech & Pharmaceuticals,Job Type : Full-time,,External Job Description\nRole Overview\nThe I...,en
22,"Business Analyst, Master Data Management",IQVIA,Madrid,2021-07-28,https://www.glassdoor.com/partner/jobListing.h...,Industry : Biotech & Pharmaceuticals,Job Type : Full-time,,External Job Description\nRole Overview\nThe I...,en
42,Business Analyst - Zimit,Alight,Granada,2021-07-29,https://www.glassdoor.com/partner/jobListing.h...,Industry : Business Services,Job Type : Full-time,,"Other Permanent Granada (GRA1), Spain Posted o...",en


### Looking for the terms on the Job Descriptions

In [223]:
# Maps each search term to the number of postings mentioning it.
skills_count_dict = dict()

In [224]:
# Count, for every search term, how many postings mention it in the job
# description, and store the count in skills_count_dict.
#
# - The descriptions are lower-cased once, outside the loop.
# - dict.fromkeys() visits each distinct term once (first-seen order), so
#   terms shared by the English and Spanish lists are not scanned and
#   printed twice; the resulting dict is the same.
# - regex=False matches each term literally; na=False counts a missing
#   description as "not mentioned".
#
# NOTE: this is plain substring matching, so a short term is also counted
# when it occurs inside a longer word (e.g. 'excel' inside 'excellent').
descriptions_lower = df_full_concat_sp['Job Description'].str.lower()
for skill in dict.fromkeys(skills_list_sp_en):
    n_postings = int(descriptions_lower.str.contains(skill, regex=False, na=False).sum())
    print(skill,' was found in ',n_postings,' job postings')
    skills_count_dict[skill] = n_postings

data visualization  was found in  27  job postings
data viz  was found in  1  job postings
visualizing data  was found in  1  job postings
reporting  was found in  123  job postings
power bi  was found in  40  job postings
powerbi  was found in  16  job postings
tableau  was found in  50  job postings
looker  was found in  15  job postings
qlik  was found in  17  job postings
google data studio  was found in  4  job postings
gds  was found in  0  job postings
data cleaning  was found in  0  job postings
data prep  was found in  1  job postings
data preparation  was found in  1  job postings
programming  was found in  55  job postings
matlab  was found in  1  job postings
 r   was found in  12  job postings
python  was found in  55  job postings
sas  was found in  72  job postings
sql  was found in  104  job postings
nosql  was found in  9  job postings
machine learning  was found in  18  job postings
artificial  was found in  8  job postings
intelligence  was found in  58  job postings

In [225]:
# Display the term -> posting-count mapping built by the loop above.
skills_count_dict

{'data visualization': 27,
 'data viz': 1,
 'visualizing data': 1,
 'reporting': 123,
 'power bi': 40,
 'powerbi': 16,
 'tableau': 50,
 'looker': 15,
 'qlik': 17,
 'google data studio': 4,
 'gds': 0,
 'data cleaning': 0,
 'data prep': 1,
 'data preparation': 1,
 'programming': 55,
 'matlab': 1,
 ' r ': 12,
 'python': 55,
 'sas': 72,
 'sql': 104,
 'nosql': 9,
 'machine learning': 18,
 'artificial': 8,
 'intelligence': 58,
 ' ml ': 4,
 'microsoft excel': 5,
 'excel': 259,
 'etl': 23,
 'azure': 21,
 'aws': 34,
 'google cloud platform': 0,
 'gcp': 0,
 'google': 39,
 'oracle': 5,
 'domain': 29,
 'industry': 157,
 'apache': 0,
 'spark': 12,
 'dataproc': 0,
 'databricks': 6,
 'airflow': 1,
 'dbt': 2,
 'pipeline': 17,
 'scalding': 0,
 'hadoop': 4,
 'pig': 8,
 'visualización': 4,
 'visualizar': 2,
 'reporteria': 0,
 'reportes': 4,
 'limpieza': 4,
 'preparacion': 0,
 'preparación': 0,
 'inteligencia': 1,
 'dominio': 3,
 'industria': 18}

In [226]:
# Turn the term -> count mapping into a two-column frame, one row per term,
# in the dict's insertion order.
skills_df_sp = pd.DataFrame({
    'skill': list(skills_count_dict.keys()),
    'count': list(skills_count_dict.values()),
})

In [227]:
# Preview the first rows of the skill-count frame.
skills_df_sp.head()

Unnamed: 0,skill,count
0,data visualization,27
1,data viz,1
2,visualizing data,1
3,reporting,123
4,power bi,40


In [228]:
# Display the whole skill-count frame (one row per search term).
skills_df_sp

Unnamed: 0,skill,count
0,data visualization,27
1,data viz,1
2,visualizing data,1
3,reporting,123
4,power bi,40
5,powerbi,16
6,tableau,50
7,looker,15
8,qlik,17
9,google data studio,4


In [229]:
# Language split of the keyword-filtered postings.
df_full_concat_sp['lang'].value_counts()

en    699
es    140
Name: lang, dtype: int64

In [230]:
# Number of keyword-filtered postings written in Spanish.
len(df_full_concat_sp[df_full_concat_sp['lang']=='es'])

140

In [231]:
# Percentage of postings that mention each term.
#
# Terms in skills_list_en (English words plus the tool names shared by both
# lists) are measured against all postings; the remaining, Spanish-only terms
# are measured against the Spanish-language postings only.
#
# The split is derived from the term itself rather than from a hard-coded
# row position (the original sliced at row 46), so it stays correct when the
# term lists change or when skills_df_sp has already been re-sorted.
n_postings_total = len(df_full_concat_sp)
n_postings_es = len(df_full_concat_sp[df_full_concat_sp['lang']=='es'])
is_spanish_only = ~skills_df_sp['skill'].isin(skills_list_en)
denominator = np.where(is_spanish_only, n_postings_es, n_postings_total)
skills_df_sp['mentions_percent'] = (skills_df_sp['count'] / denominator * 100).round(2)

In [234]:
# Rank the terms from most to least mentioned.
skills_df_sp = skills_df_sp.sort_values(by='mentions_percent', ascending=False)

## Exporting the results

In [235]:
# Export is disabled. NOTE(review): the target below is an absolute, machine-specific
# Windows path; switch to a path relative to the notebook before re-enabling it.
#skills_df_sp.to_csv(r'C:\Users\Gonzalo\Documents\DH\Contenido\ds_blend_students_2020\Proyectos Propios\By Industry\0 - Jobs\Data Analytics\Results - CSV\results_sp.csv')