# Word2Vec and LDA
https://www.kaggle.com/code/jl18pg052/word-embedding-word2vec-topic-modelling-lda

### Extracting informations from Text using Text Mining Techniques

Import Libraries

In [64]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import re, nltk
nltk.download("stopwords")
from nltk.corpus import stopwords
from wordcloud import WordCloud
import spacy
nlp = spacy.load('en_core_web_sm')
nltk.download('wordnet')
from nltk.stem import WordNetLemmatizer
from tqdm import tqdm
from gensim.models import Word2Vec
from sklearn.manifold import TSNE
import gensim
from gensim import corpora
from gensim.models.coherencemodel import CoherenceModel
import matplotlib.colors as mcolors
from collections import Counter
from matplotlib.ticker import FuncFormatter
from bokeh.plotting import figure, output_file, show
from bokeh.models import Label
from bokeh.io import output_notebook
nltk.download('punkt')
import ast

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


Import `module_details_labelled.csv` Dataset

In [65]:
data = pd.read_csv("module_details_labelled.csv")
data.head()

Unnamed: 0,School,Major,Course_Code,Course_Name,Course_Description,Prereq,Category
0,NUS,Data Science and Analytics,DSA1101,Introduction to Data Science,The abundance of data being harvested from var...,"{'or': ['MA1301:D', 'MA1301FC:D', 'MA1301X:D']}",Core
1,NUS,Data Science and Analytics,CS2040,Data Structures and Algorithms,This course introduces students to the design ...,"{'or': ['CS1010:D', 'CS1010E:D', 'CS1010X:D', ...",Core
2,NUS,Data Science and Analytics,DSA2101,Essential Data Analytics Tools: Data Visualisa...,Data visualisation is an essential tool for da...,"{'and': [{'or': ['BT1101:D', 'DSA1101:D', 'DSE...",Core
3,NUS,Data Science and Analytics,DSA2102,Essential Data Analytics Tools: Numerical Comp...,This course aims at introducing basic concepts...,"{'and': [{'or': ['MA1101R:D', 'MA1508E:D', 'MA...",Core
4,NUS,Data Science and Analytics,MA2001,Linear Algebra I,This course is a first course in linear algebr...,"{'or': ['MA1301:D', 'MA1301FC:D', 'MA1301X:D']}",Core


#### Dataset Description

#### Getting Relevant modules (not part of core curriculum and GE)

In [66]:
# get only core modules (now includes electives)
core1 = ['Core', 'Elective']
data_core = data.loc[data['Category'].isin(core1)]

### Text Cleaning

Now let's start the text cleaning process.

In [67]:
def cleaned_text(text):
    clean = re.sub("\n"," ",text) # removes line breaks and newlines
    clean=clean.lower() # converts into lowercase
    clean=re.sub(r"[~.,%/:;?_&+*=!-]"," ",clean) # removes punctuations
    clean=re.sub("[^a-z]"," ",clean) # removes non-alphabetical char
    clean=clean.lstrip() # removes leading whitespace
    clean=re.sub("\s{2,}"," ",clean) #s single spaces throughout
    return clean
data_core["cleaned_descriptions"]=data_core["Course_Description"].apply(cleaned_text)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data_core["cleaned_descriptions"]=data_core["Course_Description"].apply(cleaned_text)


Joins words into sentences

In [68]:
data_core["cleaned_descriptions"] = data_core["cleaned_descriptions"].apply(lambda x: ' '.join([word for word in x.split() if len(word)>3]))

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data_core["cleaned_descriptions"] = data_core["cleaned_descriptions"].apply(lambda x: ' '.join([word for word in x.split() if len(word)>3]))


In [69]:
data_core["cleaned_descriptions"].head(10)

0    abundance data being harvested from various se...
1    this course introduces students design impleme...
2    data visualisation essential tool data analyti...
3    this course aims introducing basic concepts we...
4    this course first course linear algebra fundam...
5    this course single variable calculus will intr...
6    this module applies advanced calculus practica...
7    this course calculus functions several real va...
8    this course gives elementary introduction prob...
9    this course introduces students theoretical un...
Name: cleaned_descriptions, dtype: object

Tokenise words before lemmatising

In [70]:
data_core["tokenized"]=data_core["cleaned_descriptions"].apply(lambda x: nltk.word_tokenize(x))

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data_core["tokenized"]=data_core["cleaned_descriptions"].apply(lambda x: nltk.word_tokenize(x))


In [71]:
def word_lemmatizer(text):
    lem_text = [WordNetLemmatizer().lemmatize(i,pos='v') for i in text]
    return lem_text
data_core["lemmatized"]=data_core["tokenized"].apply(lambda x: word_lemmatizer(x))

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data_core["lemmatized"]=data_core["tokenized"].apply(lambda x: word_lemmatizer(x))


In [72]:
# joins lemmatized words into sentences
data_core["lemmatize_joined"]=data_core["lemmatized"].apply(lambda x: ' '.join(x))

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data_core["lemmatize_joined"]=data_core["lemmatized"].apply(lambda x: ' '.join(x))


In [73]:
stop = stopwords.words('english')
words_to_stop = ["also", "students", "course", "introduce", "content", \
                 "include", "introducing", "used","weeks", "allow", "knowledge", "concisely", "page", "harvest", "skills",\
                 "basic","use","task","state","introduction", \
                 "design","techniques","concepts","theory","application","process","understand","analytics", \
                 "develop","apply","relate","value","cover","simple","must","will","course","courses"]
stop.extend(words_to_stop)

# lemmatise our stop words
# stop_df = pd.Series(stop)
# stop_df = stop_df.apply(lambda x: word_lemmatizer([x])[0])
# stop = list(stop_df)

data_core["stop_removed_descriptions"]=data_core["lemmatize_joined"].apply(lambda x: ' '.join([word for word in x.split() if word not in (stop)]))
data_core['stop_removed_descriptions'] = data_core['stop_removed_descriptions'].str.replace(r'\bML\b', 'machine learning', regex=True)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data_core["stop_removed_descriptions"]=data_core["lemmatize_joined"].apply(lambda x: ' '.join([word for word in x.split() if word not in (stop)]))
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data_core['stop_removed_descriptions'] = data_core['stop_removed_descriptions'].str.replace(r'\bML\b', 'machine learning', regex=True)


In [74]:
data_core['Number_of_words_for_cleaned'] = data_core['stop_removed_descriptions'].apply(lambda x:len(str(x).split()))

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data_core['Number_of_words_for_cleaned'] = data_core['stop_removed_descriptions'].apply(lambda x:len(str(x).split()))


### Topic Modelling using LDA

The input will be in the form of document-term matrix, and we will convert that using the below piece of code.

In [75]:
lemmatized_stuff = data_core["lemmatized"] # when not removing stopwords

In [76]:
dictionary = corpora.Dictionary(lemmatized_stuff)
doc_term_matrix = [dictionary.doc2bow(rev) for rev in lemmatized_stuff]

Running models to determine optimal number of topics.

In [77]:
LDA = gensim.models.ldamodel.LdaModel

### Using optimal number of topics = 4
After trial and error, we decided on 4 as the number of optimal topics.

In [78]:
lda_model = LDA(corpus=doc_term_matrix, id2word=dictionary, num_topics=4, random_state=100,
                chunksize=200, passes=100, minimum_probability = 0)
lda_model.print_topics()
coherence_model_lda = CoherenceModel(model=lda_model,
texts=lemmatized_stuff, dictionary=dictionary, coherence='c_v')
coherence_lda = coherence_model_lda.get_coherence()
print('\nCoherence Score: ', coherence_lda)


Coherence Score:  0.38857452604842413


### Dominant topic for each Document

Finalised Topics

In [79]:
# Topic Labelling
topic_labels = {
    0: 'Project Management',
    1: 'Algorithms and Numerical Methods',
    2: 'Machine Learning',
    3: 'Math and Statistics'
}

In [80]:
def format_topics_sentences(ldamodel=None, corpus=None, texts=data):
    # Init output
    sent_topics_df = pd.DataFrame()

    # Get main topic in each document
    for i, row_list in enumerate(ldamodel[corpus]):
        row = row_list[0] if ldamodel.per_word_topics else row_list
        # print(row)
        row = sorted(row, key=lambda x: (x[1]), reverse=True)
        # Get the Dominant topic, Perc Contribution and Keywords for each document
        for j, (topic_num, prop_topic) in enumerate(row):
            if j == 0:  # => dominant topic
                wp = ldamodel.show_topic(topic_num)
                topic_keywords = ", ".join([word for word, prop in wp])
                sent_topics_df = sent_topics_df.append(pd.Series([topic_labels[topic_num], round(prop_topic,4), topic_keywords]), ignore_index=True)
            else:
                break
    sent_topics_df.columns = ['Dominant_Topic', 'Perc_Contribution', 'Topic_Keywords']

    # Add original text to the end of the output
    contents = pd.Series(texts)
    sent_topics_df = pd.concat([sent_topics_df, contents], axis=1)
    return(sent_topics_df)


df_topic_sents_keywords = format_topics_sentences(ldamodel=lda_model, corpus=doc_term_matrix, texts=lemmatized_stuff)

# Format
df_dominant_topic = df_topic_sents_keywords.reset_index()
df_dominant_topic.columns = ['Document_No', 'Dominant_Topic', 'Topic_Perc_Contrib', 'Keywords', 'Text']
df_dominant_topic.head(10)

  sent_topics_df = sent_topics_df.append(pd.Series([topic_labels[topic_num], round(prop_topic,4), topic_keywords]), ignore_index=True)
  sent_topics_df = sent_topics_df.append(pd.Series([topic_labels[topic_num], round(prop_topic,4), topic_keywords]), ignore_index=True)
  sent_topics_df = sent_topics_df.append(pd.Series([topic_labels[topic_num], round(prop_topic,4), topic_keywords]), ignore_index=True)
  sent_topics_df = sent_topics_df.append(pd.Series([topic_labels[topic_num], round(prop_topic,4), topic_keywords]), ignore_index=True)
  sent_topics_df = sent_topics_df.append(pd.Series([topic_labels[topic_num], round(prop_topic,4), topic_keywords]), ignore_index=True)
  sent_topics_df = sent_topics_df.append(pd.Series([topic_labels[topic_num], round(prop_topic,4), topic_keywords]), ignore_index=True)
  sent_topics_df = sent_topics_df.append(pd.Series([topic_labels[topic_num], round(prop_topic,4), topic_keywords]), ignore_index=True)
  sent_topics_df = sent_topics_df.append(pd.Series([top

Unnamed: 0,Document_No,Dominant_Topic,Topic_Perc_Contrib,Keywords,Text
0,0,Algorithms and Numerical Methods,0.6706,"data, course, this, students, will, program, p...","[abundance, data, be, harvest, from, various, ..."
1,1,Algorithms and Numerical Methods,0.976,"data, course, this, students, will, program, p...","[this, course, introduce, students, design, im..."
2,2,Algorithms and Numerical Methods,0.9867,"data, course, this, students, will, program, p...","[data, visualisation, essential, tool, data, a..."
3,3,Algorithms and Numerical Methods,0.8799,"data, course, this, students, will, program, p...","[this, course, aim, introduce, basic, concepts..."
4,4,Algorithms and Numerical Methods,0.9888,"data, course, this, students, will, program, p...","[this, course, first, course, linear, algebra,..."
5,5,Math and Statistics,0.9895,"model, course, this, students, calculus, topic...","[this, course, single, variable, calculus, wil..."
6,6,Math and Statistics,0.984,"model, course, this, students, calculus, topic...","[this, module, apply, advance, calculus, pract..."
7,7,Math and Statistics,0.9912,"model, course, this, students, calculus, topic...","[this, course, calculus, function, several, re..."
8,8,Math and Statistics,0.9893,"model, course, this, students, calculus, topic...","[this, course, give, elementary, introduction,..."
9,9,Math and Statistics,0.9871,"model, course, this, students, calculus, topic...","[this, course, introduce, students, theoretica..."


### Determine Topic of a New Document
### input: string;
### output: strings of dominant_topic, topic_keywords, topic_distribution


In [81]:
def preprocess_new_document(document):
    # Step 1: Remove words with length less than or equal to 3
    cleaned_text_document = cleaned_text(document)
    remove_function = lambda x: ' '.join([word for word in x.split() if len(word)>3])
    cleaned_removed_short_words_document = remove_function(cleaned_text_document)

    # Step 2: Tokenize the cleaned text
    tokenized_text_document = nltk.word_tokenize(cleaned_removed_short_words_document)

    # Step 3: Lemmatize the tokens
    lemmatized_text_document =  word_lemmatizer(tokenized_text_document)

    # # Step 4: Remove stopwords
    # stop_removed_text = ' '.join([word for word in lemmatized_text_document if word not in stop])
    # stop_removed_text = re.sub(r'\bML\b', 'machine learning', stop_removed_text)
    # return stop_removed_text

    # Step 4: Joining words together
    joined_text_document = ' '.join([word for word in lemmatized_text_document])

    return joined_text_document

# Example usage of the function
new_document = "This course teaches neural networks, and popular models such as Random Forest and XGBoost."
preprocessed_document = preprocess_new_document(new_document)
print(preprocessed_document)


this course teach neural network popular model such random forest xgboost


In [82]:
def get_document_topic(ldamodel, doc_term_matrix):
    topic_labels = {
    0: 'Project Management',
    1: 'Algorithms and Numerical Methods',
    2: 'Machine Learning',
    3: 'Math and Statistics'
}

    topic_scores = ldamodel.get_document_topics(doc_term_matrix)

    dominant_topic = max(topic_scores, key=lambda x: x[1])[0]
    topic_keywords = ", ".join([word for word, prop in ldamodel.show_topic(dominant_topic)])

    # relabel topic
    labelled_dominant_topic = topic_labels[dominant_topic]
    return labelled_dominant_topic, topic_keywords

def assign_cluster(new_document, ldamodel, dictionary):
    # Preprocess the new document
    preprocessed_doc = preprocess_new_document(new_document)

    # Convert the preprocessed document to a list of tokens
    tokens = preprocessed_doc.split()  # Split the string into tokens

    # Convert the tokens to a bag-of-words vector using the dictionary
    new_bow = dictionary.doc2bow(tokens)

    # Get the dominant topic and keywords for the new document
    dominant_topic, topic_keywords = get_document_topic(ldamodel, new_bow)

    # Topic distribution
    topic_distribution = lda_model.get_document_topics(new_bow)

    # Relabel the topic distribution
    topic_labels = {
    0: 'Project Management',
    1: 'Algorithms and Numerical Methods',
    2: 'Machine Learning',
    3: 'Math and Statistics'
}

    result = []
    for i in range(len(topic_distribution)):
        topic_id, probability = topic_distribution[i]
        topic_label = topic_labels.get(topic_id)
        result.append((topic_label, probability))

    return dominant_topic, topic_keywords, result

    # result here is the topic distribution
    # e.g. Topic Distribution: [('Project Management', 0.3359872), ('Algorithms and Numerical Methods', 0.19626443), ('Machine Learning', 0.4144256), ('Math and Statistics', 0.053322762)]

### Let user put in their own document to the LDA model
input: string
output: dictionary, formatting up to approver

In [83]:
def cluster_doc(document):
  dominant_topic, topic_keywords, topic_distribution = assign_cluster(document, lda_model, dictionary)
  dict = {}
  for i in topic_distribution:
    dict[i[0]] = round(i[1]*100,1)

  return dict

### Topic Distribution for each course
### input: dataframe column
### output: adds dataframe column (no output, it will just modify the existing dataframe u fit in; you can save the csv then query from there)

In [86]:
course_data = pd.read_csv("module_details_labelled.csv")

In [87]:
def topic_distribution_for_each_course(course_data,description):
  course_data["cluster_assigned"]= course_data[description].apply(lambda x: assign_cluster(x, lda_model, dictionary))
  course_data["dominant_topic"] = course_data["cluster_assigned"].apply(lambda x: x[0])

  course_data["topic_keywords"] = course_data["cluster_assigned"].apply(lambda x: x[1])

  course_data["topic_distribution"] = course_data["cluster_assigned"].apply(lambda x: x[2])

In [88]:
topic_distribution_for_each_course(course_data,"Course_Description")

In [89]:
course_data.head()

Unnamed: 0,School,Major,Course_Code,Course_Name,Course_Description,Prereq,Category,cluster_assigned,dominant_topic,topic_keywords,topic_distribution
0,NUS,Data Science and Analytics,DSA1101,Introduction to Data Science,The abundance of data being harvested from var...,"{'or': ['MA1301:D', 'MA1301FC:D', 'MA1301X:D']}",Core,"(Algorithms and Numerical Methods, data, cours...",Algorithms and Numerical Methods,"data, course, this, students, will, program, p...","[(Project Management, 0.0033871268), (Algorith..."
1,NUS,Data Science and Analytics,CS2040,Data Structures and Algorithms,This course introduces students to the design ...,"{'or': ['CS1010:D', 'CS1010E:D', 'CS1010X:D', ...",Core,"(Algorithms and Numerical Methods, data, cours...",Algorithms and Numerical Methods,"data, course, this, students, will, program, p...","[(Project Management, 0.007928742), (Algorithm..."
2,NUS,Data Science and Analytics,DSA2101,Essential Data Analytics Tools: Data Visualisa...,Data visualisation is an essential tool for da...,"{'and': [{'or': ['BT1101:D', 'DSA1101:D', 'DSE...",Core,"(Algorithms and Numerical Methods, data, cours...",Algorithms and Numerical Methods,"data, course, this, students, will, program, p...","[(Project Management, 0.0044416343), (Algorith..."
3,NUS,Data Science and Analytics,DSA2102,Essential Data Analytics Tools: Numerical Comp...,This course aims at introducing basic concepts...,"{'and': [{'or': ['MA1101R:D', 'MA1508E:D', 'MA...",Core,"(Algorithms and Numerical Methods, data, cours...",Algorithms and Numerical Methods,"data, course, this, students, will, program, p...","[(Project Management, 0.0040227626), (Algorith..."
4,NUS,Data Science and Analytics,MA2001,Linear Algebra I,This course is a first course in linear algebr...,"{'or': ['MA1301:D', 'MA1301FC:D', 'MA1301X:D']}",Core,"(Algorithms and Numerical Methods, data, cours...",Algorithms and Numerical Methods,"data, course, this, students, will, program, p...","[(Project Management, 0.0036828266), (Algorith..."


In [90]:
course_data.to_csv("lda_topic_distribution_for_modules_final.csv") # can save the csv then query the result u want from here

### Define function that segregate topics and their proportions in each module (course) or job

In [91]:
def segregate(df, i):
  topic_distribution = df.loc[i, 'topic_distribution']
  topic_distribution = ast.literal_eval(topic_distribution) # to convert string to a list of tuples
  for topic, prop in topic_distribution:
    df.loc[i, topic] = prop

### Calculating our Results for Different Schools' Majors
### input: our data (dataframe)
### output: dataframe with all the averages

In [92]:
modules_df = pd.read_csv("lda_topic_distribution_for_modules_final.csv")

In [93]:
def average_topic_distribution_for_majors(modules_df):
  new_modules_df = modules_df.loc[modules_df['Category'].isin(['Core', 'Elective'])].copy() # create new df for modules
  new_modules_df = new_modules_df.reset_index(drop=True) # resets index for new new_modules_df
  new_modules_df["Algorithms and Numerical Methods"], new_modules_df['Machine Learning'], new_modules_df['Project Management'], new_modules_df['Math and Statistics'], = 0, 0, 0, 0
  new_modules_df = new_modules_df.astype({'Algorithms and Numerical Methods':'float',
                                          'Machine Learning':'float',
                                          'Project Management':'float',
                                          'Math and Statistics':'float'})

  for i in range(len(new_modules_df)):
    segregate(new_modules_df, i) # segregate topics and proportion in each module/course

  td_majors = round((new_modules_df.groupby(['School', 'Major'])['Algorithms and Numerical Methods',
                                    'Machine Learning',
                                    'Project Management',
                                    'Math and Statistics'].mean())*100, 1)
  return td_majors

In [94]:
topic_distribution_majors = average_topic_distribution_for_majors(modules_df)

  td_majors = round((new_modules_df.groupby(['School', 'Major'])['Algorithms and Numerical Methods',


In [95]:
topic_distribution_majors

Unnamed: 0_level_0,Unnamed: 1_level_0,Algorithms and Numerical Methods,Machine Learning,Project Management,Math and Statistics
School,Major,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
NTU,Data Science and Artificial Intelligence,46.8,14.2,21.8,17.2
NTU,Economics and Data Science,27.8,34.3,24.0,13.9
NUS,Business Analytics,34.0,20.8,32.6,12.6
NUS,Data Science and Analytics,55.4,4.3,3.5,36.8
NUS,Data Science and Economics,50.0,24.6,3.3,22.1
NUS,Quantitative Finance,28.6,4.9,18.0,48.5
NUS,Statistics,34.0,6.4,7.5,52.1
SMU,Data Science and Analytics,48.4,14.8,22.2,14.5
SMU,Information Systems (Business Analytics),26.7,32.9,39.9,0.5
SMU,Quantitative Finance,19.2,2.6,60.9,17.3


In [96]:
topic_distribution_majors.to_csv("average_topic_distribution_for_majors.csv") # can save the csv then query the result u want from here

### Topic Distribution for each job role

In [97]:
job_data = pd.read_csv("job_offers_categorized.csv")

In [98]:
def topic_distribution_for_each_job(job_data, job_desc):
  job_data["cluster_assigned"]= job_data[job_desc].apply(lambda x: assign_cluster(x, lda_model, dictionary))
  job_data["dominant_topic"] = job_data["cluster_assigned"].apply(lambda x: x[0])
  job_data["topic_keywords"] = job_data["cluster_assigned"].apply(lambda x: x[1])
  job_data["topic_distribution"] = job_data["cluster_assigned"].apply(lambda x: x[2])

In [99]:
topic_distribution_for_each_job(job_data,"job_desc")

In [100]:
job_data.to_csv("lda_topic_distribution_for_jobs.csv")

### Calculating our Results for Different Job Roles
### input: our data (dataframe)
### output: dataframe with all the averages

In [101]:
jobs_df = pd.read_csv("lda_topic_distribution_for_jobs.csv")

In [102]:
def average_topic_distribution_for_jobs(jobs_df):
  jobs_df["Algorithms and Numerical Methods"], jobs_df['Machine Learning'], jobs_df['Project Management'], jobs_df['Math and Statistics'], = 0, 0, 0, 0
  jobs_df = jobs_df.astype({'Algorithms and Numerical Methods':'float',
                            'Machine Learning':'float',
                            'Project Management':'float',
                            'Math and Statistics':'float'})

  for i in range(len(jobs_df)):
    segregate(jobs_df, i)

  td_jobs = round((jobs_df.groupby(['job_type'])['Algorithms and Numerical Methods',
                                    'Machine Learning',
                                    'Project Management',
                                    'Math and Statistics'].mean())*100, 1)
  return td_jobs

In [103]:
topic_distribution_jobs = average_topic_distribution_for_jobs(jobs_df)

  td_jobs = round((jobs_df.groupby(['job_type'])['Algorithms and Numerical Methods',


In [104]:
topic_distribution_jobs

Unnamed: 0_level_0,Algorithms and Numerical Methods,Machine Learning,Project Management,Math and Statistics
job_type,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Business Analyst,25.0,25.0,47.6,2.4
Data Analyst,26.5,26.2,44.1,3.3
Data Scientist,25.1,28.2,44.2,2.5
Quantitative Analyst,20.4,24.2,50.8,4.6
Quantitative Researcher,20.0,26.7,49.4,3.9


In [105]:
topic_distribution_jobs.to_csv("average_topic_distribution_for_jobs.csv")