# Text preprocessing: MR3

Creating clean text for LDA topic modelling.

In [1]:
import re
import numpy as np
import pandas as pd
import pandas_profiling

import os
import glob

from pprint import pprint
import string
from itertools import chain

#NLTK
import nltk
from nltk.corpus import stopwords 
from nltk.stem.wordnet import WordNetLemmatizer
from nltk.tokenize import sent_tokenize, word_tokenize

# Gensim
import gensim
import gensim.corpora as corpora
from gensim.utils import simple_preprocess
from gensim.models import CoherenceModel

import spacy
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer

# Plotting tools
import pyLDAvis
import pyLDAvis.gensim  # don't skip this
import matplotlib.pyplot as plt
%matplotlib inline

# Enable logging for gensim - optional
import logging
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.ERROR)

import warnings
warnings.filterwarnings("ignore",category=DeprecationWarning)

In [2]:
from langdetect import detect
# Progress bars for loops / pandas operations.
# tqdm.auto selects the notebook widget bar inside Jupyter and the console bar elsewhere;
# the top-level `tqdm.tqdm_notebook` alias is deprecated in recent tqdm releases.
from tqdm.auto import tqdm
tqdm_notebook = tqdm  # keep the old name available for any cell that still refers to it
# Register the pandas integration on the class: this is what adds `progress_map` /
# `progress_apply` to Series and DataFrame. Calling it on the class, rather than on a
# throw-away `tqdm_notebook()` instance, avoids rendering an empty progress bar.
tqdm.pandas()

HBox(children=(FloatProgress(value=1.0, bar_style='info', max=1.0), HTML(value='')))

  from pandas import Panel


# Data

## H2020

In [3]:
# All the fields for NLP analysis are merged in the data frame h2020_full
h2020 = pd.read_csv("datasets/data_may20/h2020_clean/h2020_full.csv")
h2020.shape

(29314, 31)

In [4]:
h2020.head()

Unnamed: 0,rcn,id,acronym,status,programme,topics,frameworkProgramme,title,startDate,endDate,...,rcn_report,title_report,summary,workPerformed,finalResults,lastUpdateDate,projectID,projectAcronym,relatedFile,url
0,229267,894593,ICARUS,SIGNED,H2020-EU.3.4.7.,SESAR-ER4-31-2019,H2020,INTEGRATED COMMON ALTITUDE REFERENCE SYSTEM FO...,2020-05-01,2022-07-31,...,,,,,,,,,,
1,229284,897004,ISLand,SIGNED,H2020-EU.1.3.2.,MSCA-IF-2019,H2020,Isolation and Segregation Landscape. Archaeolo...,2020-11-01,2023-10-31,...,,,,,,,,,,
2,229281,896300,STRETCH,SIGNED,H2020-EU.1.3.2.,MSCA-IF-2019,H2020,Smart Textiles for RETrofitting and Monitoring...,2020-09-01,2022-08-31,...,,,,,,,,,,
3,229265,892890,RhythmicPrediction,SIGNED,H2020-EU.1.3.2.,MSCA-IF-2019,H2020,Rhythmic prediction in speech perception: are ...,2021-01-01,2022-12-31,...,,,,,,,,,,
4,229235,886828,ASAP,SIGNED,H2020-EU.1.3.2.,MSCA-IF-2019,H2020,Advanced Solutions for Asphalt Pavements,2021-09-01,2023-08-31,...,,,,,,,,,,


In [5]:
#h2020.profile_report()

# FILL IN!

MISSING VALUES (_____ projects):
- title: 0
- objective: 0
- summary: 12 378 (48%)
- workPerformed: 12 377 (48%)
- finalResults: 12 388 (48%)

## FP7

In [6]:
# All the fields for NLP analysis are merged in the data frame fp7_full
fp7 = pd.read_csv("datasets/data_may20/fp7_clean/fp7_full.csv")
fp7.shape

(25716, 26)

In [7]:
fp7.head()

Unnamed: 0,rcn,id,acronym,status,programme,topics,frameworkProgramme,title,startDate,endDate,...,coordinator,coordinatorCountry,participants,participantCountries,subjects,summary,lastUpdateDate,rcn_report,title_report,projectAcronym
0,203726,115760,ZAPI,ONG,FP7-JTI,IMI-JU-11-2013-04,FP7,Zoonotic Anticipation and Preparedness Initiative,2015-03-01,2020-02-29,...,MERIAL SAS,FR,ACADEMISCH ZIEKENHUIS LEIDEN;STIFTUNG TIERAERZ...,NL;DE;SE;FR;ES,,,,,,
1,109748,320377,NetSat,ONG,FP7-IDEAS-ERC,ERC-AG-PE8,FP7,Networked Pico-Satellite Distributed System Co...,2014-08-01,2019-07-31,...,Zentrum fuer Telematik e.V.,DE,JULIUS-MAXIMILIANS-UNIVERSITAT WURZBURG,DE,,Multi-satellite systems are currently gaining ...,2017-10-30 17:19:45,202963.0,Periodic Report Summary 2 - NETSAT (Networked ...,NetSat
2,188675,615785,EMERGING SUBJECTS,ONG,FP7-IDEAS-ERC,ERC-CG-2013-SH2,FP7,Emerging Subjects of the New Economy: Tracing ...,2014-09-01,2019-06-30,...,UNIVERSITY COLLEGE LONDON,UK,,,,Emerging Subjects has reached a mid-point in i...,2018-01-15 17:25:32,213801.0,Periodic Report Summary 2 - EMERGING SUBJECTS ...,EMERGING SUBJECTS
3,188672,615640,FOREFRONT,ONG,FP7-IDEAS-ERC,ERC-CG-2013-PE6,FP7,Frontiers of Extended Formulations,2014-09-01,2019-08-31,...,UNIVERSITE LIBRE DE BRUXELLES,BE,,,,The FOREFRONT project (Frontiers of Extended F...,2017-07-24 18:20:56,201439.0,Periodic Report Summary 2 - FOREFRONT (Frontie...,FOREFRONT
4,189842,617196,CORRELMAT,ONG,FP7-IDEAS-ERC,ERC-CG-2013-PE3,FP7,Predictive electronic structure calculations f...,2014-07-01,2019-06-30,...,ECOLE POLYTECHNIQUE,FR,,,,"“Correlated electron materials”, i.e. compound...",2017-05-16 10:26:14,197668.0,Mid-Term Report Summary - CORRELMAT (Predictiv...,CORRELMAT


In [8]:
#fp7.profile_report()

# FILL IN!

MISSING VALUES (______ projects):
- title: 0
- objective: 0
- summary: 4 538 (18%)
- workPerformed:  NOT INCLUDED
- finalResults:  NOT INCLUDED

# REVIEW IF IT STILL HOLDS FOR MR3

In summary, fields:
- title and objective are always present and can be used
- summary is present in around half of H2020 projects and 72% of FP7 projects (note: the 18% missing reported for FP7 above would correspond to about 82% — to be re-checked)
- workPerformed and finalResults are present in around half of H2020 projects and not at all in FP7 projects

At the moment, the text for the NLP analysis is constructed by using the longest string that can be built for each project. This creates textual descriptions of unequal length, with some projects having more detailed descriptions than others. If this turns out to affect performance, skewing the results towards one group of projects, the algorithm can be run separately on two or more groups of projects.
Example:
- projects with short textual descriptions (just title and objective)
- projects with longer textual descriptions (other fields available in addition to title and objective)

# Selected mobility projects (final, from MR2)

In [9]:
mobility_projects = pd.read_csv("datasets/data_may20/final_project_selection_mr2_outlier_removed.csv")
mobility_projects.shape

(926, 13)

In [10]:
mobility_projects.head(2)

Unnamed: 0.1,Unnamed: 0,index,id,acronym,title,objective,summary,workPerformed,finalResults,text,clean_text,score,framework
0,0,0,211625,ASSET,ASSET – Aeronautic Study on Seamless Transport,Airport ground processes still conceal a consi...,,,,ASSET – Aeronautic Study on Seamless Transport...,asset aeronautic study seamless transport airp...,0.572112,FP7
1,1,1,211723,MEFISTO,Methodology for framework programmes’ impact a...,This proposal is responding to the first FP7- ...,The MEFISTO project had three main objectives:...,,,Methodology for framework programmes’ impact a...,methodology framework programme impact assessm...,0.124785,FP7


In [11]:
mobility_projects_ids_list = list(mobility_projects.id)
len(mobility_projects_ids_list)

926

In [12]:
# should we use the full dataset or just the mobility projects

use_full_dataset = False

In [44]:
# IDs shouldn't have changed from MR2 --> MR3

mobility_projects[mobility_projects.id == 211625]

Unnamed: 0.1,Unnamed: 0,index,id,acronym,title,objective,summary,workPerformed,finalResults,text,clean_text,score,framework
0,0,0,211625,ASSET,ASSET – Aeronautic Study on Seamless Transport,Airport ground processes still conceal a consi...,,,,ASSET – Aeronautic Study on Seamless Transport...,asset aeronautic study seamless transport airp...,0.572112,FP7


In [45]:
fp7[fp7.id == 211625]

Unnamed: 0,rcn,id,acronym,status,programme,topics,frameworkProgramme,title,startDate,endDate,...,coordinator,coordinatorCountry,participants,participantCountries,subjects,summary,lastUpdateDate,rcn_report,title_report,projectAcronym
16245,90080,211625,ASSET,ONG,FP7-TRANSPORT,AAT-2007-2.2-02;AAT-2007-3.2-02,FP7,ASSET – Aeronautic Study on Seamless Transport,2008-06-01,2011-11-30,...,DEUTSCHES ZENTRUM FUER LUFT - UND RAUMFAHRT EV,DE,ID PARTNERS;IDEMIA IDENTITY & SECURITY FRANCE;...,FR;EL;DE;UK;SK,,,,,,


# Feature engineering

In [13]:
# Stack the H2020 and FP7 project tables into one frame (union of their columns)

df_full = pd.concat([h2020, fp7], axis=0, join='outer', ignore_index=True)

if not use_full_dataset:
    # keep only the rows whose project id is in the mobility selection
    is_mobility_project = df_full.id.isin(mobility_projects_ids_list)
    df_full = df_full[is_mobility_project]

df_full.shape

(926, 31)

In [14]:
df_full.head(2)

Unnamed: 0,rcn,id,acronym,status,programme,topics,frameworkProgramme,title,startDate,endDate,...,rcn_report,title_report,summary,workPerformed,finalResults,lastUpdateDate,projectID,projectAcronym,relatedFile,url
245,224495,876943,DAICY,CLOSED,H2020-EU.3.;H2020-EU.2.3.;H2020-EU.2.1.,EIC-SMEInst-2018-2020,H2020,Design and AI for sustainable and safe motorCY...,2019-07-01,2019-09-30,...,429360.0,Periodic Reporting for period 1 - DAICY (Desig...,Problems: \nUrban driving safety \nCO2 emissio...,We studied the European market. The most impor...,"Expected result: Introduction of the safest, s...",2020-01-29 08:45:15,876943.0,DAICY,/docs/results/h2020/876/876943_PS/tarform-prof...,http://www.tarform.com
335,224539,878052,MAAS,CLOSED,H2020-EU.3.;H2020-EU.2.3.;H2020-EU.2.1.,EIC-SMEInst-2018-2020,H2020,MOBILITY AS A SERVICE PLATFORM for employers a...,2019-08-01,2019-11-30,...,440380.0,Periodic Reporting for period 1 - MAAS (MOBILI...,"Problems regarding urban transportation, such ...",Technical feasibility – we have developed a r...,We have achieved a thorough analysis of the ta...,2020-03-20 19:09:32,878052.0,MAAS,/docs/results/h2020/878/878052_PS/maas.png,https://www.mobilityconcept.nl/


In [15]:
#df_full.tail()

In [16]:
# Check that all the projects ids are distinct

df_full[df_full.duplicated(subset=['id'], keep=False)].shape

(0, 31)

In [17]:
df_full.columns.values

array(['rcn', 'id', 'acronym', 'status', 'programme', 'topics',
       'frameworkProgramme', 'title', 'startDate', 'endDate',
       'projectUrl', 'objective', 'totalCost', 'ecMaxContribution',
       'call', 'fundingScheme', 'coordinator', 'coordinatorCountry',
       'participants', 'participantCountries', 'subjects', 'rcn_report',
       'title_report', 'summary', 'workPerformed', 'finalResults',
       'lastUpdateDate', 'projectID', 'projectAcronym', 'relatedFile',
       'url'], dtype=object)

## Extract fields to be used in NLP

- title, objective, summary, workPerformed, finalResults + other basic info (id, acronym)

In [18]:
df = df_full[['id', 'acronym', 'title', 'objective', 'summary', 'workPerformed', 'finalResults']].sort_values('id').reset_index(drop=True)

In [19]:
# Replace missing values (NaN) with empty strings so the text fields can be joined
# as plain strings. `fillna` returns a new frame, which keeps this cell idempotent and
# drops the `regex=True` flag that had no meaning for a NaN pattern.
df = df.fillna('')

In [20]:
# Build one document per project by joining its natural-language fields with spaces

text_fields = ['title', 'objective', 'summary', 'workPerformed', 'finalResults']
df['text'] = df[text_fields].apply(' '.join, axis=1)

docs = df['text'].tolist()

len(docs)

926

In [21]:
type(df.text[0]) # THIS IS WHAT IS CONSIDERED THE DOCUMENT FOR THE NLP POV!

str

In [22]:
#df.loc[df.shape[0]-1,'text']

In [23]:
#print(df.loc[2,'text'])

# Text pre-processing

In [24]:
# break each doc into sentences
df['sentences'] = df.text.progress_map(sent_tokenize)

HBox(children=(FloatProgress(value=0.0, max=926.0), HTML(value='')))




In [25]:
df.sentences[0][:3]

['Nanoelectronics for Safe, Fuel Efficient and Environment Friendly Automotive Solutions The societal need for a transport infrastructure based upon the availability of safe, fuel-efficient and environmental-friendly cars is clearly recognized by the European citizens and the European Commission.',
 'The fulfillment of this ambition is not to be taken for granted, as it requires the development of a host of  automotive technologies, systems, software and tools.',
 'It is the objective of this project to create an integrated automotive control platform, enabled by breakthroughs in the areas of efficient fuel consumption, reduced CO2 emission and safe driving, to be achieved by the development of nanoelectronic components, subsystems and architectures.']

In [26]:
# Split every sentence of every document into word tokens
# (each document becomes a list of per-sentence token lists)

def tokenize_sentences(sentences):
    """Word-tokenize each sentence of one document."""
    return [word_tokenize(sentence) for sentence in sentences]

df['tokens'] = df['sentences'].progress_map(tokenize_sentences)

HBox(children=(FloatProgress(value=0.0, max=926.0), HTML(value='')))




In [27]:
len(df['tokens'][0][0])

41

# Lemmatization with POS tagging

In [28]:
from nltk import pos_tag

In [29]:
df['pos_tokens'] = df['tokens'].progress_map(lambda tokens: [pos_tag(token) for token in tokens])

HBox(children=(FloatProgress(value=0.0, max=926.0), HTML(value='')))




In [30]:
df.pos_tokens[0][0][:6]

[('Nanoelectronics', 'NNS'),
 ('for', 'IN'),
 ('Safe', 'NNP'),
 (',', ','),
 ('Fuel', 'NNP'),
 ('Efficient', 'NNP')]

In [31]:
len(df.pos_tokens)

926

In [32]:
# Inspired from https://stackoverflow.com/a/15590384
from nltk.corpus import wordnet

def get_wordnet_pos(treebank_tag):
    """Map a POS tag to the matching WordNet POS constant by its first letter.

    Tags starting with 'J', 'V', 'N' and 'R' map to the WordNet adjective, verb,
    noun and adverb constants respectively. Any other tag yields the empty string.
    """
    prefix_to_wordnet_pos = (
        ('J', wordnet.ADJ),
        ('V', wordnet.VERB),
        ('N', wordnet.NOUN),
        ('R', wordnet.ADV),
    )
    for prefix, wordnet_pos in prefix_to_wordnet_pos:
        if treebank_tag.startswith(prefix):
            return wordnet_pos
    return ''

from nltk.stem.wordnet import WordNetLemmatizer
lemmatizer = WordNetLemmatizer()

In [33]:
# Lemmatize each word with its POS tag: that way, all the verbs are lemmatized to the
# infinitive, all the nouns to their dictionary forms etc.
#
# NOTE(review): tokens are lemmatized in their original case and only lowercased in the
# later stop-word filtering cell. Capitalised tokens appear to come through un-lemmatized
# (e.g. 'Solutions' ends up as 'solutions' in the sample output) -- confirm whether
# lowercasing before lemmatization is wanted; it would change the resulting vocabulary.

def lemmatize_tagged_token(token, treebank_tag):
    """Return the WordNet lemma of `token`, or `token` unchanged if its tag has no WordNet POS."""
    wordnet_pos = get_wordnet_pos(treebank_tag)  # looked up once per token instead of twice
    if wordnet_pos == '':
        return token
    return lemmatizer.lemmatize(token, wordnet_pos)


df['lemmas'] = df['pos_tokens'].progress_map(
    lambda list_tokens_POS: [
        # each element of pos_tokens is a (token, POS tag) tuple
        [lemmatize_tagged_token(token, tag) for token, tag in pos_tokens]
        for pos_tokens in list_tokens_POS
    ]
)

HBox(children=(FloatProgress(value=0.0, max=926.0), HTML(value='')))




In [34]:
len(df.lemmas)

926

In [35]:
df.lemmas[0][0][:8]

['Nanoelectronics',
 'for',
 'Safe',
 ',',
 'Fuel',
 'Efficient',
 'and',
 'Environment']

# Removing stop words

In [36]:
stopwords_verbs = ['say', 'get', 'go', 'know', 'may', 'need', 'like', 'make', 'see', 'want', 'come', 'take', \
                   'use', 'would', 'can', 'show', 'think', 'deduce', 'prove', 'experiment']
stopwords_research = ['project', 'research', 'initiative', 'h2020', 'fp7', 'science', 'investigation']

stopwords_other = ["also", "develop", "one", "new", "include", "well", "work", "provide", "approach", \
                   "different", "time", "good", "result", 'activity', 'development', 'key', 'analysis', \
                  'impact', 'tool', 'develop', 'method', 'level', 'task', 'change', 'large', 'objective', \
                  'perform', 'time', 'good', 'increase', 'process']

In [37]:
# The NLTK stop-word file id is lowercase 'english'. The capitalised 'English' only
# resolves on case-insensitive file systems and fails on case-sensitive ones (e.g. Linux).
my_stopwords = stopwords.words('english') + stopwords_verbs + stopwords_research + stopwords_other
len(my_stopwords)

237

In [38]:
from itertools import chain # to flatten list of sentences of tokens into list of tokens

In [39]:
df['tokens'] = df['lemmas'].map(lambda sentences: list(chain.from_iterable(sentences)))

In [40]:
df['tokens'][0][:8]

['Nanoelectronics',
 'for',
 'Safe',
 ',',
 'Fuel',
 'Efficient',
 'and',
 'Environment']

In [41]:
# remove pre-defined stopwords, tokens of length 1 and any tokens that are not alphabetic;
# the surviving tokens are lowercased
stopword_set = set(my_stopwords)  # set lookup instead of scanning the list for every token

df['tokens'] = df['tokens'].map(
    lambda tokens: [
        token.lower()
        for token in tokens
        if token.isalpha() and len(token) > 1 and token.lower() not in stopword_set
    ]
)

In [42]:
df['tokens'][0][:8]

['nanoelectronics',
 'safe',
 'fuel',
 'efficient',
 'environment',
 'friendly',
 'automotive',
 'solutions']

In [43]:
df['tokens'].head()

0    [nanoelectronics, safe, fuel, efficient, envir...
1    [flight, dynamic, control, bird, insect, insec...
2    [acare, goals, progress, evaluation, acare, ad...
3    [strengthening, railway, vehicles, center, fac...
4    [clean, sky, support, action, sky, joint, tech...
Name: tokens, dtype: object

In [46]:
len(df.tokens)

926

In [47]:
" ".join(df.tokens[0])

'nanoelectronics safe fuel efficient environment friendly automotive solutions societal transport infrastructure base upon availability safe car clearly recognize european citizen european commission fulfillment ambition grant require host automotive technology system software create integrated automotive control platform enable breakthrough area efficient fuel consumption reduce emission safe driving achieve nanoelectronic component subsystem architecture goal line target define eniac strategic agenda target parameter following reduction annual cost cause road accident estimate europe improvement fuel consumption efficiency reduction emission line reduction plan subdivide following workpackages definition carrier specification design optimized control system novel automotive technology integration sub system design reliability test yield prototyping functional verification objectives dissemination release deliverables execute consortium incorporate major european industrial institutio

### Create one big string for each project's cleaned text

In [48]:
df['clean_text'] = df["tokens"].apply(lambda x: " ".join(x))

In [49]:
type(df.clean_text[0])

str

In [50]:
df.head(2)

Unnamed: 0,id,acronym,title,objective,summary,workPerformed,finalResults,text,sentences,tokens,pos_tokens,lemmas,clean_text
0,120009,SE2A,"Nanoelectronics for Safe, Fuel Efficient and E...",The societal need for a transport infrastructu...,,,,"Nanoelectronics for Safe, Fuel Efficient and E...","[Nanoelectronics for Safe, Fuel Efficient and ...","[nanoelectronics, safe, fuel, efficient, envir...","[[(Nanoelectronics, NNS), (for, IN), (Safe, NN...","[[Nanoelectronics, for, Safe, ,, Fuel, Efficie...",nanoelectronics safe fuel efficient environmen...
1,204513,DCBIF,Flight dynamics and control of birds and insects,"Insects bristle with sensors, but how do they ...",This project aimed to develop an understanding...,,,Flight dynamics and control of birds and insec...,[Flight dynamics and control of birds and inse...,"[flight, dynamic, control, bird, insect, insec...","[[(Flight, NNP), (dynamics, NNS), (and, CC), (...","[[Flight, dynamic, and, control, of, bird, and...",flight dynamic control bird insect insects bri...


In [54]:
# save df as it is to CSV - careful, list-valued columns are written as their string repr
# index=False: the index is just the 0..n-1 range produced by reset_index(drop=True), and
# writing it would come back as a spurious 'Unnamed: 0' column on a plain pd.read_csv.
# NOTE(review): confirm no downstream reader loads this file with index_col=0.

df.to_csv("datasets/data_may20/outputs/mobility_projects_clean_text.csv", index=False)

In [53]:
# save it to pickle as well, so that lists can be loaded back as lists

df.to_pickle("datasets/data_may20/outputs/mobility_projects_clean_text.pkl")

In [55]:
# to parquet - DO IT LATER

#DataFrame.to_parquet(self, fname, engine='auto', compression='snappy', index=None, partition_cols=None, **kwargs)

In [56]:
# read the dataset from the save CSV (clean_text column is important)

#df = pd.read_csv("datasets/data_oct19/auxiliar/full_dataset_clean_text.csv")

In [57]:
type(df.clean_text[0])

str

# Create a vocabulary

Using the tokenised and clean texts, create a full vocabulary to be used by ML algorithms.

In [58]:
def vocabulary_construction_old(text, remove_singletons = True):
    """
    Old function, left for comparison.

    Builds a vocabulary from raw documents: each document is word-tokenised,
    reduced to lowercased alphabetic tokens, stripped of English stop words,
    stemmed with the Porter stemmer and then passed through the WordNet lemmatizer.

    Parameters
    ----------
    text : iterable of str
        The documents to process.
    remove_singletons : bool, default True
        If True, words that occur exactly once over all documents are dropped from
        the returned vocabulary (they remain in the frequency distribution).

    Returns
    -------
    (set, nltk.probability.FreqDist)
        The vocabulary and the word frequency distribution, in that order
        (note: `vocabulary_construction` returns them the other way round).
    """
    # Loop-invariant helpers: built once, instead of re-reading the stop-word list
    # for every word and re-creating the stemmer/lemmatizer for every document.
    english_stopwords = set(stopwords.words('english'))
    porter = nltk.stem.PorterStemmer()
    wnl = nltk.WordNetLemmatizer()

    voc = []

    for i, txt in enumerate(text):
        # progress message every 1000 documents
        if i % 1000 == 0:
            print("Iteration ", i)

        # split into words
        tokens = nltk.word_tokenize(txt)

        # keep only word tokens (words), and make them all lowercase
        words = [w.lower() for w in tokens if w.isalpha()]

        # exclude stopwords in english
        filtered_words = [w for w in words if w not in english_stopwords]

        # stem the words using PorterStemmer (nltk), then lemmatize with WordNetLemmatizer
        stemmed = [porter.stem(w) for w in filtered_words]
        voc += [wnl.lemmatize(w) for w in stemmed]

    dist = nltk.probability.FreqDist(voc) # dist is of type nltk.probability.FreqDist- contains a dict.

    # if remove_singletons is set to True, it will remove from the vocabulary all the words that appear only once
    if remove_singletons:
        singletons = [w for w in dist.keys() if dist[w] == 1]
        voc = set(voc) - set(singletons)

    return set(voc), dist

In [59]:
def vocabulary_construction(token_list, remove_singletons=False):
    """
    Using field 'tokens' in the dataframe df, construct a FreqDist,
    as well as just a regular set that stands for a vocabulary.

    Parameters
    ----------
    token_list : iterable of iterables of str
        One collection of tokens per document.
    remove_singletons : bool, default False
        If True, words that occur exactly once over all documents are dropped from
        the returned vocabulary (they remain in the frequency distribution).

    Returns
    -------
    (nltk.probability.FreqDist, set)
        The word frequency distribution and the vocabulary, in that order.
    """
    # Imported locally so the function does not depend on `import itertools`, which
    # this notebook only runs in a cell placed after this definition.
    from itertools import chain

    # flatten the per-document token lists into one list of tokens
    voc = list(chain.from_iterable(token_list))

    dist = nltk.probability.FreqDist(voc) # dist is of type nltk.probability.FreqDist- contains a dict.

    # if remove_singletons is set to True, it will remove from the vocabulary all the words that appear only once
    if remove_singletons:
        singletons = [w for w in dist.keys() if dist[w] == 1]
        voc = set(voc) - set(singletons)

    return dist, set(voc)

In [60]:
def vocabulary_construction_from_text(clean_text_list, remove_singletons=False):
    """
    Build a frequency distribution and a vocabulary from cleaned project texts.

    Each element of `clean_text_list` is one project's clean text, i.e. its tokens
    joined into a single whitespace-separated string.

    Returns (dist, vocabulary): the nltk FreqDist over all words and the set of
    distinct words. With remove_singletons=True, words that appear only once are
    left out of the set.
    """
    words = []
    for clean_text in clean_text_list:
        words.extend(clean_text.split())

    dist = nltk.probability.FreqDist(words)

    if not remove_singletons:
        return dist, set(words)

    # keep only the words that do not appear exactly once
    repeated_words = {word for word, count in dist.items() if count != 1}
    return dist, repeated_words

In [61]:
# test vocabulary_construction
exmpl_text = ["My little cat went to little wood called Wood and met Little dog.", \
              " It was raining cats that day."]

exmpl_text2 = ['little cat wood dog',
              'rain cat day dog',
              'dog wood']

exmpl_tokens = [['little', 'cat', 'wood', 'dog'],
                ['rain', 'cat', 'day', 'dog'],
                ['dog','wood']
    ]

#dist_exmpl, dist1 = vocabulary_construction_old(exmpl_text)

#dist_exmpl

In [62]:
import itertools
from itertools import chain

In [63]:
dist, voc2 = vocabulary_construction(exmpl_tokens)
print(len(voc2))
voc2

6


{'cat', 'day', 'dog', 'little', 'rain', 'wood'}

In [64]:
dist, voc2 = vocabulary_construction_from_text(exmpl_text2)
print(len(voc2))
voc2

6


{'cat', 'day', 'dog', 'little', 'rain', 'wood'}

In [65]:
dist, voc2 = vocabulary_construction(exmpl_tokens, remove_singletons=True)
print(len(voc2))
voc2

3


{'cat', 'dog', 'wood'}

In [66]:
dist, voc2 = vocabulary_construction_from_text(exmpl_text2, remove_singletons=True)
print(len(voc2))
voc2

3


{'cat', 'dog', 'wood'}

In [67]:
dist

FreqDist({'dog': 3, 'cat': 2, 'wood': 2, 'little': 1, 'rain': 1, 'day': 1})

In [68]:
# Construct vocabulary using all the projects
execute_vocabulary_construction = True

if execute_vocabulary_construction:
    freqdist_full, vocabulary = vocabulary_construction(list(df.tokens), remove_singletons=True)
    # A bare expression inside an `if` block is not displayed by Jupyter, so print the size.
    print(len(vocabulary))

In [73]:
import json
# Sort the vocabulary so its order is reproducible: iterating a set of strings can give
# a different order in every kernel session, which would change the saved file and any
# feature index built from this list.
voc = sorted(vocabulary)

save_vocabulary = False

In [74]:
if save_vocabulary:
    print(voc[0:20])

    with open('datasets/data_may20/outputs/vocabulary_mobility_projects_nosingletons.json', 'w', encoding='utf-8') as f:
        json.dump(voc, f, ensure_ascii=False, indent=4)


['quasi', 'ana', 'intensity', 'registered', 'homogenize', 'depart', 'tolling', 'fcb', 'presently', 'wrp', 'reichel', 'prototyping', 'kom', 'arrow', 'personalised', 'resonator', 'poles', 'piece', 'cumulated', 'syndrome']


In [75]:
# read the vocabulary
# The file is written as UTF-8 with ensure_ascii=False, so decode it explicitly as UTF-8
# instead of relying on the platform default encoding.
with open('datasets/data_may20/outputs/vocabulary_mobility_projects_nosingletons.json', 'r', encoding='utf-8') as f:
    voc_read = json.load(f)

print(type(voc_read), len(voc_read))

<class 'list'> 15023


In [76]:
print(voc_read[0:20])

['quasi', 'ana', 'intensity', 'registered', 'homogenize', 'depart', 'tolling', 'fcb', 'presently', 'wrp', 'reichel', 'prototyping', 'kom', 'arrow', 'personalised', 'resonator', 'poles', 'piece', 'cumulated', 'syndrome']


In [77]:
# some SANITY CHECKS

words_sanity = ["project", "mumba_bumba", "mobility", "h2020", "transport", "krebs", "10", "air", "aircraft", \
               "traffic", "aviation", "passenger", "flight", "transportation", "airline", "mobil", \
               "aviat", "airlin", "airport", "research", "innovation"]

for w in words_sanity:
    print(w, " ",w in voc_read)


project   False
mumba_bumba   False
mobility   True
h2020   False
transport   True
krebs   False
10   False
air   True
aircraft   True
traffic   True
aviation   True
passenger   True
flight   True
transportation   True
airline   True
mobil   True
aviat   False
airlin   False
airport   True
research   False
innovation   True


In [78]:
# check some extra words

words_extra = ["intermodality", "intermodal", "crossmodality", "crossmodal", "comodal", "comodality", "also", \
               "one", "approach"]

for w in words_extra:
    print(w, " ",w in voc_read)

intermodality   True
intermodal   True
crossmodality   False
crossmodal   True
comodal   False
comodality   True
also   False
one   False
approach   False


# Features

#### CountVectorizer by sklearn

Convert a collection of text documents to a matrix of token counts

This implementation produces a sparse representation of the counts using scipy.sparse.csr_matrix.

If you do not provide an a-priori dictionary and you do not use an analyzer that does some kind of feature selection then the number of features will be equal to the vocabulary size found by analyzing the data.

### Bigrams and trigrams


# TO DO!

In [34]:
# test vocabulary_construction
exmpl_text = ["My little cat went to little wood called Wood and met Little dog.", \
              " It was raining cats that day."]

exmpl_tokens = [['little', 'cat', 'wood', 'dog'],
                ['rain', 'cat', 'day', 'dog', 'cat'],
                ['dog','wood']
    ]

exmpl_docs = [" ".join(x) for x in exmpl_tokens]
print(exmpl_docs)

dist, voc2 = vocabulary_construction(exmpl_tokens, remove_singletons = False)
print(len(voc2))
voc2

['little cat wood dog', 'rain cat day dog cat', 'dog wood']
6


{'cat', 'day', 'dog', 'little', 'rain', 'wood'}

In [35]:
from nltk.util import bigrams
from nltk.util import everygrams

In [36]:
bigram_trigram_list = []

for x in exmpl_tokens:
    bigram_trigram_list += list(everygrams(x, min_len=2, max_len=3))

# CREATE A GENERATOR!
allgram_generator = (" ".join(x) for x in bigram_trigram_list)

set(allgram_generator)

{'cat day',
 'cat day dog',
 'cat wood',
 'cat wood dog',
 'day dog',
 'day dog cat',
 'dog cat',
 'dog wood',
 'little cat',
 'little cat wood',
 'rain cat',
 'rain cat day',
 'wood dog'}

In [37]:
print(*map(' '.join, bigram_trigram_list), sep=', ')

little cat, cat wood, wood dog, little cat wood, cat wood dog, rain cat, cat day, day dog, dog cat, rain cat day, cat day dog, day dog cat, dog wood


In [39]:
# with building of vocabulary
"""
tf_vectorizer = CountVectorizer(max_df=0.95, min_df=1,
                                max_features=10000,
                                stop_words='english')
                                """

#use pre-built vocabulary

tf_vectorizer = CountVectorizer(vocabulary= voc2)

tf = tf_vectorizer.fit_transform(exmpl_docs).toarray()

In [40]:
tf.shape

(3, 19)

In [41]:
type(tf)

numpy.ndarray

In [42]:
tf

array([[1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0],
       [2, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0]])

In [43]:
tf_vectorizer.get_feature_names()

['cat',
 'cat day',
 'cat day dog',
 'cat wood',
 'cat wood dog',
 'day',
 'day dog',
 'day dog cat',
 'dog',
 'dog cat',
 'dog wood',
 'little',
 'little cat',
 'little cat wood',
 'rain',
 'rain cat',
 'rain cat day',
 'wood',
 'wood dog']

#### TfidfVectorizer by sklearn

Convert a collection of raw documents to a matrix of TF-IDF features.

Equivalent to CountVectorizer followed by TfidfTransformer.

In [60]:
tf_vectorizer = TfidfVectorizer(vocabulary=set(voc2))

tf = tf_vectorizer.fit_transform(exmpl_docs).toarray()

In [61]:
print(tf.shape)
tf

(3, 6)


array([[0.4804584 , 0.        , 0.37311881, 0.63174505, 0.        ,
        0.4804584 ],
       [0.70443024, 0.46312056, 0.27352646, 0.        , 0.46312056,
        0.        ],
       [0.        , 0.        , 0.61335554, 0.        , 0.        ,
        0.78980693]])

In [62]:
""" 
For guided LDA to work, this needs to be converted into a matrix of integer.
"""

tf2 = np.rint(tf * 100).astype(int)
tf2

array([[48,  0, 37, 63,  0, 48],
       [70, 46, 27,  0, 46,  0],
       [ 0,  0, 61,  0,  0, 79]])

In [63]:
# inspect the values that TF-IDF gives when working with the cleaned project texts (df['clean_text'])
# and the vocabulary that was read back from the JSON file (voc_read)
docs = list(df['clean_text'].values)
tf_vectorizer = TfidfVectorizer(vocabulary = set(voc_read))
            
tf = tf_vectorizer.fit_transform(docs)

print(tf.shape)

(51235, 208181)


In [64]:
tf2 = np.rint(tf * 100).astype(int)

# Final vocabulary

In [190]:
#voc_final = voc_read
voc_final = voc
type(voc_final)

list

In [191]:
len(voc_final)

208153

# Validation

In [198]:
#files = os.listdir("mr2_results/lda_outputs/")

# all CSV files in the LDA outputs folder (this list is overwritten by the next glob)
files_lda = glob.glob("mr2_results/lda_outputs/*.csv")

# just v2.X files
# NOTE(review): `version` is not assigned in any cell visible in this part of the notebook --
# confirm it is defined before this cell when running on a fresh kernel.
files_lda = glob.glob("mr2_results/lda_outputs/*" + version + ".csv")

len(files_lda)

30