In [38]:
import pandas as pd
import numpy as np
import json
import nltk
import re
import csv
import matplotlib.pyplot as plt
import seaborn as sns
from tqdm import tqdm
from datetime import datetime
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split

In [39]:
# Load the tagged-articles export: semicolon-separated, double-quoted fields.
meta = pd.read_csv(
    'Articoli Taggati v8.csv',
    sep=";",
    quotechar='"',
    skipinitialspace=True,
)
#meta = meta.replace(np.nan, '', regex=True)
# Remove the 'Unnamed: 0' and 'Txt' columns from the loaded frame.
del meta['Unnamed: 0']
del meta['Txt']

In [40]:
# Preview the first five rows of the article metadata.
meta.head(n=5)

Unnamed: 0,ID,Titolo,Data,Trend,Anno,Mese,Day,Week,Keywords,Tags,Tech,Company,Vertical,Application,Location,Tag
0,7f9f2aea41caf2f034af9b5423abb95f,investor group european oil gas major credible...,2020-05-14,Circular Economy Sustainability and Zero Impac...,2020,5,14,20,"netzero,oil,tpi,oil gas,emissions",,,,,,,
1,c812e0a3418d624ae60df3ec9e19d184,pandemic drone spot infected,2020-04-03,Robotics and Human/Machine Hybridization,2020,4,3,14,"drone,detect,respiratory,infected,heart",Tech: Drones,Drones,,,,,
2,61268fb46585bba4ac828cc277b69ecb,europe green deal eu commission president ursu...,2019-12-15,High Tech Farming,2019,12,15,50,"europe,von,december th cynthia,man,announces",,,,,,,
3,d3e4f8a85f1c96d9839de642e856645f,green swans exponential decade conversation jo...,2020-05-19,Digital Politics and Smart Citizenship,2020,5,19,21,"elkington,swans,green swans,green,swan",,,,,,,
4,879f738c66345cf45682fc3e2117e743,sustainable nation film showcases israeli wate...,2020-04-26,Smart & Digital Water,2020,4,26,17,"film,nation,israel,water,sustainable",,,,,,,


In [41]:
# Report how many articles were loaded.
article_count = meta['ID'].shape[0]
print("Articoli: %d" % article_count)

Articoli: 741


In [42]:
# Report how many articles have a null 'Tags' cell.
untagged_count = meta['Tags'].isnull().sum()
print("articoli senza tag: %d" % untagged_count)

articoli senza tag: 452


In [43]:
# Report how many articles have a null 'Keywords' cell.
missing_keywords_count = meta['Keywords'].isnull().sum()
print("articoli senza keyword: %d" % missing_keywords_count)

articoli senza keyword: 0


In [44]:
# Split the comma-separated keywords of the article at index label 1.
meta.loc[1, 'Keywords'].split(',')

['drone', 'detect', 'respiratory', 'infected', 'heart']

In [45]:
# Load the keyword -> tags lookup table.
# `encoding` and `error_bad_lines` are read_csv options, not read_excel ones;
# current pandas rejects them with a TypeError, so they are not passed here.
tags = pd.read_excel("Keywords to Tags V2.xlsx")
# Replace NaN cells with '' so every remaining cell holds a string.
tags = tags.replace(np.nan, '', regex=True)
# Remove the 'Tag 1' .. 'Tag 3' columns from the loaded frame.
del tags['Tag 1']
del tags['Tag 2']
del tags['Tag 3']

In [46]:
# Preview the first five keyword -> tags mappings.
tags.head(n=5)

Unnamed: 0,Words,Tags
0,aarogya setu,Tech: Mobile app|Application: contact tracing
1,academic,Vertical: University
2,acquiring,Vertical: M&A
3,ad,Vertical: Advertising
4,ad networks,Vertical: Advertising


In [47]:
# Keep only the article ID and its comma-separated keywords.
articoli = meta.loc[:, ['ID', 'Keywords']]

In [48]:
# Preview the first five (ID, Keywords) rows.
articoli.head(n=5)

Unnamed: 0,ID,Keywords
0,7f9f2aea41caf2f034af9b5423abb95f,"netzero,oil,tpi,oil gas,emissions"
1,c812e0a3418d624ae60df3ec9e19d184,"drone,detect,respiratory,infected,heart"
2,61268fb46585bba4ac828cc277b69ecb,"europe,von,december th cynthia,man,announces"
3,d3e4f8a85f1c96d9839de642e856645f,"elkington,swans,green swans,green,swan"
4,879f738c66345cf45682fc3e2117e743,"film,nation,israel,water,sustainable"


In [49]:
# Number of rows in the (ID, Keywords) frame.
articoli['ID'].shape[0]

741

In [50]:
# Build one (Words, ID) row per keyword of every article.
# The keyword strings are split once for the whole column and flattened, instead of
# creating one Series per article with iterrows() and concatenating them all
# (slow, and pd.concat raises on an empty frame).
keyword_lists = articoli['Keywords'].str.split(',')
words = pd.DataFrame(
    {
        # Flatten the per-article keyword lists, keeping article order.
        'Words': [keyword for keywords in keyword_lists for keyword in keywords],
        # Repeat each article ID once per keyword it carries.
        'ID': np.repeat(articoli['ID'].values, keyword_lists.str.len().values.astype(int)),
    },
    columns=['Words', 'ID'],
)

In [51]:
# Preview the first five (Words, ID) rows.
words.head(n=5)

Unnamed: 0,Words,ID
0,netzero,7f9f2aea41caf2f034af9b5423abb95f
1,oil,7f9f2aea41caf2f034af9b5423abb95f
2,tpi,7f9f2aea41caf2f034af9b5423abb95f
3,oil gas,7f9f2aea41caf2f034af9b5423abb95f
4,emissions,7f9f2aea41caf2f034af9b5423abb95f


In [64]:
# Left-join the lookup table with the per-article keywords on 'Words':
# every lookup row is kept, with ID left as NaN when no article uses that word.
res = pd.merge(tags, words, how='left', on="Words")

In [65]:
# Preview the first five rows of the lookup/keyword join.
res.head(n=5)

Unnamed: 0,Words,Tags,ID
0,aarogya setu,Tech: Mobile app|Application: contact tracing,
1,academic,Vertical: University,cf4a502cbc3be4bf184002d5db67ac81
2,academic,Vertical: University,c3867e2ca29ffade71f104301bf2a743
3,acquiring,Vertical: M&A,
4,ad,Vertical: Advertising,0e2ef6bd7619374e0531d1551ab2141e


In [66]:
#res.to_csv('prova.csv', sep=";")

In [67]:
# Collapse the join to one row per article ID: each remaining column becomes the
# list of its values for that article, and 'Tags' is renamed to 'ComputedTags'.
rows_by_article = res.groupby('ID')
res = rows_by_article.agg(lambda column: column.tolist()).rename(columns={'Tags': 'ComputedTags'})

In [68]:
# Preview the first five per-article rows (lists of words and computed tags).
res.head(n=5)

Unnamed: 0_level_0,Words,ComputedTags
ID,Unnamed: 1_level_1,Unnamed: 2_level_1
005b6d185b27aa2b15483734fac9c657,[solar],[]
00b4785a7e2de5a5df9b6ee9adba52e7,[florida],[Location: Florida]
021bf18c94ed25c0ed697b8e73f588cb,"[innovation, university]","[, ]"
024e39e4c912e043aceb022b442f8a6e,[silicon valley],[]
02783899e72f373f5fa8b56cb84b7d1a,"[health, rpa, uipath]","[, , ]"


In [69]:
# Attach the per-article word/tag lists to the article metadata; the left join
# keeps every article, including those with no matching row in res.
df = pd.merge(meta, res, how='left', on="ID")

In [82]:
# Replace every NaN in the merged frame with an empty string; articles that had
# no matching row in the left merge otherwise carry NaN in Words/ComputedTags.
df = df.replace(np.nan, '', regex=True)

In [91]:
# Preview the first five merged rows.
df.head(n=5)

Unnamed: 0,ID,Titolo,Data,Trend,Anno,Mese,Day,Week,Keywords,Tags,Tech,Company,Vertical,Application,Location,Tag,Words,ComputedTags
0,7f9f2aea41caf2f034af9b5423abb95f,investor group european oil gas major credible...,2020-05-14,Circular Economy Sustainability and Zero Impac...,2020,5,14,20,"netzero,oil,tpi,oil gas,emissions",,,,,,,,"[oil, oil gas]","[, ]"
1,c812e0a3418d624ae60df3ec9e19d184,pandemic drone spot infected,2020-04-03,Robotics and Human/Machine Hybridization,2020,4,3,14,"drone,detect,respiratory,infected,heart",Tech: Drones,Drones,,,,,,"[drone, heart]","[Tech: Drones, ]"
2,61268fb46585bba4ac828cc277b69ecb,europe green deal eu commission president ursu...,2019-12-15,High Tech Farming,2019,12,15,50,"europe,von,december th cynthia,man,announces",,,,,,,,[europe],[Location: Europe]
3,d3e4f8a85f1c96d9839de642e856645f,green swans exponential decade conversation jo...,2020-05-19,Digital Politics and Smart Citizenship,2020,5,19,21,"elkington,swans,green swans,green,swan",,,,,,,,,
4,879f738c66345cf45682fc3e2117e743,sustainable nation film showcases israeli wate...,2020-04-26,Smart & Digital Water,2020,4,26,17,"film,nation,israel,water,sustainable",,,,,,,,"[israel, sustainable, water]","[, , ]"


In [59]:
# Report the row count of the merged frame.
merged_row_count = df['ID'].shape[0]
print("Numero: %d" % merged_row_count)

Numero: 741


In [89]:
def string_to_list(text):
    """Join the distinct, non-empty entries of ``text`` into one ", "-separated string.

    Despite the name, the result is a string, not a list.

    Parameters
    ----------
    text : iterable of str, str, None or NaN
        Usually a list of tag strings that may contain empty strings and
        duplicates. A plain string (including the '' placeholder) is returned
        unchanged instead of being iterated character by character, so applying
        the function twice to the same value is harmless. ``None`` and NaN are
        treated as "no tags".

    Returns
    -------
    str
        The unique non-empty entries in first-seen order, joined by ", ";
        '' when there is nothing to join.
    """
    # An already-joined value or the '' placeholder: pass it through untouched.
    if isinstance(text, str):
        return text
    # Missing value (None or a float NaN, which is the only float unequal to itself).
    if text is None or (isinstance(text, float) and text != text):
        return ''
    # dict.fromkeys removes duplicates while keeping first-seen order; a set
    # would make the output order vary between interpreter runs.
    unique_entries = dict.fromkeys(entry for entry in text if entry)
    return ", ".join(unique_entries)
    
# Try the helper on the ComputedTags cell of the row with index label 736.
string_to_list(df.loc[736, 'ComputedTags'])

''

In [93]:
# Turn each article's list of computed tags into a single ", "-joined string.
df['ComputedTags'] = df['ComputedTags'].apply(string_to_list)

In [94]:
# Preview the first five rows with ComputedTags rendered as strings.
df.head(n=5)

Unnamed: 0,ID,Titolo,Data,Trend,Anno,Mese,Day,Week,Keywords,Tags,Tech,Company,Vertical,Application,Location,Tag,Words,ComputedTags
0,7f9f2aea41caf2f034af9b5423abb95f,investor group european oil gas major credible...,2020-05-14,Circular Economy Sustainability and Zero Impac...,2020,5,14,20,"netzero,oil,tpi,oil gas,emissions",,,,,,,,"[oil, oil gas]",
1,c812e0a3418d624ae60df3ec9e19d184,pandemic drone spot infected,2020-04-03,Robotics and Human/Machine Hybridization,2020,4,3,14,"drone,detect,respiratory,infected,heart",Tech: Drones,Drones,,,,,,"[drone, heart]",Tech: Drones
2,61268fb46585bba4ac828cc277b69ecb,europe green deal eu commission president ursu...,2019-12-15,High Tech Farming,2019,12,15,50,"europe,von,december th cynthia,man,announces",,,,,,,,[europe],Location: Europe
3,d3e4f8a85f1c96d9839de642e856645f,green swans exponential decade conversation jo...,2020-05-19,Digital Politics and Smart Citizenship,2020,5,19,21,"elkington,swans,green swans,green,swan",,,,,,,,,
4,879f738c66345cf45682fc3e2117e743,sustainable nation film showcases israeli wate...,2020-04-26,Smart & Digital Water,2020,4,26,17,"film,nation,israel,water,sustainable",,,,,,,,"[israel, sustainable, water]",


In [95]:
# Write the merged frame to 'prova.csv' using ';' as the field separator.
df.to_csv(path_or_buf='prova.csv', sep=';')