In [3]:
import pandas as pd
import numpy as np
import json
import nltk
import re
import csv
import matplotlib.pyplot as plt
import seaborn as sns
from tqdm import tqdm
from datetime import datetime
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split

In [4]:
# Load the tagged-article table (semicolon-separated CSV), replace missing
# values with empty strings, and drop the 'Unnamed: 0' and 'Txt' columns.
meta = (
    pd.read_csv('Articoli Taggati v8.csv', sep=";", skipinitialspace=True, quotechar='"')
    .replace(np.nan, '', regex=True)
    .drop(columns=['Unnamed: 0', 'Txt'])
)

In [21]:
# Preview the first five rows of the article metadata.
meta.head(n=5)

Unnamed: 0,ID,Titolo,Data,Trend,Anno,Mese,Day,Week,Keywords,Tags_x,...,Application_x,Location_x,Tag_x,Tags_y,Tech_y,Company_y,Vertical_y,Application_y,Location_y,Tag_y
0,c812e0a3418d624ae60df3ec9e19d184,pandemic drone spot infected,03/04/2020,Robotics and Human/Machine Hybridization,2020,4,3,14,"drone,detect,respiratory,infected,heart,pandem...",Tech: Drones,...,,,,Tech: Drones,Drones,,,,,
1,0e2ef6bd7619374e0531d1551ab2141e,census data anonymous expected,07/04/2020,Privacy and Security by design,2020,4,7,15,"census,cookies,ad,tracking,adobe,file,party,we...","Company: Adobe, Vertical: Advertising",...,,,,"Company: Adobe, Vertical: Advertising",,Adobe,Advertising,,,
2,8602d8791a1754506303dcfd9a7b79e3,common pandemic scams highlighted domain name ...,11/05/2020,Cyber Defense and Fight against Fakes,2020,5,11,20,"scams,fake,information,domain names,sites,doma...",,...,,,,,,,,,,
3,2157ac6d12e063b6a38c5c0491e0b322,questions rapidly unfolding future smart fabrics,11/05/2020,Workforce Transformation,2020,5,11,20,"unfolding,analyze,signs,monitor,vital,question...",Tag: Future,...,,,Future,Tag: Future,,,,,,Future
4,3c06ef110a3efad83733ad876816e16f,things ceo expect remote work life,17/05/2020,Workforce Transformation,2020,5,17,20,"productivity,im,bump,remote,meetings,working,z...",,...,,,,,,,,,,


In [22]:
# Report how many rows the 'ID' column of the article table holds.
n_articles = len(meta['ID'])
print("Numero: %d" % n_articles)

Numero: 439


In [6]:
# Inspect the comma-separated keyword string of the row labelled 1 as a list.
meta.loc[1, 'Keywords'].split(',')

['census',
 'cookies',
 'ad',
 'tracking',
 'adobe',
 'file',
 'party',
 'website',
 'ads',
 'websites',
 'third',
 'third party',
 'browser',
 'com',
 'privacy',
 'page',
 'networks',
 'anonymous',
 'representation',
 'configuration',
 'net',
 'performance',
 'visitors',
 'data',
 'ensure']

In [7]:
# Load the keyword -> tag lookup workbook, replace missing values with empty
# strings, and drop the 'Tag 1' / 'Tag 2' / 'Tag 3' columns.
#
# `encoding` and `error_bad_lines` are CSV-parser options: they have no
# meaning for an Excel workbook and current pandas rejects them with a
# TypeError, so they are not passed to read_excel.
tags = (
    pd.read_excel("Keywords to Tags V2.xlsx")
    .replace(np.nan, '', regex=True)
    .drop(columns=['Tag 1', 'Tag 2', 'Tag 3'])
)

In [8]:
# Preview the first five rows of the keyword -> tag lookup table.
tags.head(n=5)

Unnamed: 0,Words,Tags
0,aarogya setu,Tech: Mobile app|Application: contact tracing
1,academic,Vertical: University
2,acquiring,Vertical: M&A
3,ad,Vertical: Advertising
4,ad networks,Vertical: Advertising


In [9]:
# Keep only the article identifier and its keyword string.
articoli = meta.loc[:, ['ID', 'Keywords']]

In [10]:
# Preview the first five rows of the ID/Keywords subset.
articoli.head(n=5)

Unnamed: 0,ID,Keywords
0,c812e0a3418d624ae60df3ec9e19d184,"drone,detect,respiratory,infected,heart,pandem..."
1,0e2ef6bd7619374e0531d1551ab2141e,"census,cookies,ad,tracking,adobe,file,party,we..."
2,8602d8791a1754506303dcfd9a7b79e3,"scams,fake,information,domain names,sites,doma..."
3,2157ac6d12e063b6a38c5c0491e0b322,"unfolding,analyze,signs,monitor,vital,question..."
4,3c06ef110a3efad83733ad876816e16f,"productivity,im,bump,remote,meetings,working,z..."


In [11]:
# Explode each article's comma-separated keyword string into one row per
# (keyword, article ID) pair, with columns ordered as 'Words', 'ID'.
#
# The frame is built once from a flat list of records instead of creating a
# pd.Series per row and concatenating them: this avoids the per-row Series
# overhead of iterrows() and also yields an empty, correctly-labelled frame
# when there are no articles (pd.concat([]) would raise ValueError).
words = pd.DataFrame(
    [
        (keyword, article_id)
        for article_id, keyword_string in zip(articoli['ID'], articoli['Keywords'])
        for keyword in keyword_string.split(',')
    ],
    columns=['Words', 'ID'],
)

In [12]:
# Preview the first five (keyword, article ID) pairs.
words.head(n=5)

Unnamed: 0,Words,ID
0,drone,c812e0a3418d624ae60df3ec9e19d184
1,detect,c812e0a3418d624ae60df3ec9e19d184
2,respiratory,c812e0a3418d624ae60df3ec9e19d184
3,infected,c812e0a3418d624ae60df3ec9e19d184
4,heart,c812e0a3418d624ae60df3ec9e19d184


In [13]:
# Left-join the tag lookup table onto the per-article keywords: every lookup
# word is kept, and words that appear in no article get a missing ID.
res = pd.merge(tags, words, how='left', on='Words')

In [14]:
# Preview the first five rows of the word/tag/article join.
res.head(n=5)

Unnamed: 0,Words,Tags,ID
0,aarogya setu,Tech: Mobile app|Application: contact tracing,
1,academic,Vertical: University,cf4a502cbc3be4bf184002d5db67ac81
2,academic,Vertical: University,c3867e2ca29ffade71f104301bf2a743
3,acquiring,Vertical: M&A,cad8439052150cc0cb779feaa0a2a50d
4,acquiring,Vertical: M&A,9d3d71da7640282e1e4a788a1dd361a3


In [15]:
# Collapse the join to one row per article ID, gathering the remaining
# columns of each group into Python lists.
# NOTE(review): this overwrites `res` with its own aggregate, so re-running
# the cell without re-running the merge above would aggregate already-grouped
# data — confirm whether a distinct name is wanted.
per_article = res.groupby('ID')
res = per_article.agg(lambda column: column.tolist())

In [16]:
# Preview the first five per-article rows of collected words and tags.
res.head(n=5)

Unnamed: 0_level_0,Words,Tags
ID,Unnamed: 1_level_1,Unnamed: 2_level_1
0041230019f3679d60b2900af083d41d,[european],[Location: Europe]
010ce8162408ed1d3d3dab7b9a852736,[automotive],[Vertical: Automotive]
024e39e4c912e043aceb022b442f8a6e,"[amazon, bay area, seattle, silicon valley, tech]","[Company: Amazon, Location: San Francisco, , , ]"
02783899e72f373f5fa8b56cb84b7d1a,"[agencies, health, hospital, rpa, uipath]","[Vertical: Advertising, , , , ]"
02e960a1e7d6d7fd4b4bae904343bdf1,"[agency, batteries, game, kickstarter]","[Vertical: Advertising, Tech: Batteries, , ]"


In [17]:
# Attach the per-article word/tag lists to the article metadata; the left
# join keeps every article, including those without any matched word.
df = pd.merge(meta, res, how='left', on='ID')

In [18]:
# Preview the first five rows of the enriched article table.
df.head(n=5)

Unnamed: 0,ID,Titolo,Data,Trend,Anno,Mese,Day,Week,Keywords,Tags_x,...,Tag_x,Tags_y,Tech_y,Company_y,Vertical_y,Application_y,Location_y,Tag_y,Words,Tags
0,c812e0a3418d624ae60df3ec9e19d184,pandemic drone spot infected,03/04/2020,Robotics and Human/Machine Hybridization,2020,4,3,14,"drone,detect,respiratory,infected,heart,pandem...",Tech: Drones,...,,Tech: Drones,Drones,,,,,,"[airports, computer vision, drone, heart, inno...","[Vertical: Aviation, Application: Computer Vis..."
1,0e2ef6bd7619374e0531d1551ab2141e,census data anonymous expected,07/04/2020,Privacy and Security by design,2020,4,7,15,"census,cookies,ad,tracking,adobe,file,party,we...","Company: Adobe, Vertical: Advertising",...,,"Company: Adobe, Vertical: Advertising",,Adobe,Advertising,,,,"[ad, adobe, ads, privacy]","[Vertical: Advertising, Company: Adobe, Vertic..."
2,8602d8791a1754506303dcfd9a7b79e3,common pandemic scams highlighted domain name ...,11/05/2020,Cyber Defense and Fight against Fakes,2020,5,11,20,"scams,fake,information,domain names,sites,doma...",,...,,,,,,,,,"[cybersecurity, health, security]","[Application: Cybersecurity, , ]"
3,2157ac6d12e063b6a38c5c0491e0b322,questions rapidly unfolding future smart fabrics,11/05/2020,Workforce Transformation,2020,5,11,20,"unfolding,analyze,signs,monitor,vital,question...",Tag: Future,...,Future,Tag: Future,,,,,,Future,[future],[Tag: Future]
4,3c06ef110a3efad83733ad876816e16f,things ceo expect remote work life,17/05/2020,Workforce Transformation,2020,5,17,20,"productivity,im,bump,remote,meetings,working,z...",,...,,,,,,,,,[working remotely],[]


In [23]:
# Report the row count after the merge (compare with the count printed for
# `meta` to confirm the join did not add or lose articles).
n_merged = len(df['ID'])
print("Numero: %d" % n_merged)

Numero: 439
