In [1]:
import pickle
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
import numpy as np

In [2]:
# Load the pickled trend classifier (relative path, resolved against the working directory).
# NOTE(review): pickle.load can execute arbitrary code — only load files from a trusted source.
with open('trend_classifier', 'rb') as training_model:
    model = pickle.load(training_model)

In [3]:
# Load the pickled TF-IDF converter; its transform() output is what model.predict receives later.
# NOTE(review): the handle is called `training_model` although it wraps the converter file.
# NOTE(review): pickle.load can execute arbitrary code — only load files from a trusted source.
with open('trend_tfidfconverter', 'rb') as training_model:
    tfidfconverter = pickle.load(training_model)

## Load and clean data

In [4]:
# Read the semicolon-separated startup table and discard its 'Unnamed: 0' column.
startup = pd.read_csv('DB Startups v2.csv', sep=";").drop(columns=['Unnamed: 0'])
startup.head()

Unnamed: 0,Nome,Text
0,CERCAOFFICINA,cercaofficina a portal to compare quotes for r...
1,CAREPY,carepy created digital lassistente for medicat...
2,AMBIENSVR,ambiensvr a startup that develops projects or ...
3,COCOAPP,A software for handling reservations for beach
4,FLYFREE AIRWAYS,Enable to share private flights for small busi...


In [5]:
# Series.count() only counts non-null 'Nome' values.
startup_count = startup['Nome'].count()
print("Numero totale di startup: %d" % startup_count)

Numero totale di startup: 91


## Apply Tags

In [6]:
# Load the pickled stop-word collection used by remove_stopwords and by CountVectorizer.
# NOTE(review): pickle.load can execute arbitrary code — only load files from a trusted source.
with open('stop_words', 'rb') as stopwords_dump:
    stop_words = pickle.load(stopwords_dump)

In [7]:
def remove_stopwords(text, stopwords=None):
    """Drop stop words from a whitespace-tokenised string.

    Parameters
    ----------
    text : str
        Input text; it is split on whitespace and compared token by token
        (the comparison is exact and case-sensitive).
    stopwords : container of str, optional
        Words to remove. When omitted, the notebook-level ``stop_words``
        is used, so existing one-argument calls behave as before.

    Returns
    -------
    str
        The remaining tokens joined by single spaces.
    """
    if stopwords is None:
        # Resolved at call time so the function always sees the current global.
        stopwords = stop_words
    return ' '.join(word for word in text.split() if word not in stopwords)

In [8]:
# Add a 'Txt' column: the 'Text' column with stop words stripped out.
startup['Txt'] = startup['Text'].apply(remove_stopwords)

In [9]:
from sklearn.feature_extraction.text import CountVectorizer
import re

corpus = startup['Txt']
# `stop_words` comes from a pickle, so its container type is not visible here.
# CountVectorizer documents this parameter as a list (or 'english' / None) and
# recent scikit-learn releases validate it, so coerce other iterables such as a set.
vectorizer_stop_words = stop_words if isinstance(stop_words, (str, list)) else list(stop_words)
# Count 1- to 3-grams, ignoring terms found in more than 80% of the documents
# and keeping at most 10000 features.
cv=CountVectorizer(max_df=0.8,stop_words=vectorizer_stop_words, max_features=10000, ngram_range=(1,3))
X=cv.fit_transform(corpus)

In [10]:
from sklearn.feature_extraction.text import TfidfTransformer
 
# Fit IDF weights on the n-gram count matrix X produced by `cv`.
tfidf_transformer=TfidfTransformer(smooth_idf=True,use_idf=True)
tfidf_transformer.fit(X)
# CountVectorizer.get_feature_names() was removed in scikit-learn 1.2 in favour of
# get_feature_names_out() (available since 1.0). Prefer the new method when present
# and keep a plain list so positional lookups behave as before.
if hasattr(cv, "get_feature_names_out"):
    feature_names=list(cv.get_feature_names_out())
else:
    feature_names=cv.get_feature_names()

In [11]:
from scipy.sparse import coo_matrix
def sort_coo(coo_matrix):
    """Return the matrix entries as (column, value) pairs, highest value first.

    Pairs are built from the ``col`` and ``data`` attributes of ``coo_matrix``
    and ordered by value, then by column index, both descending.
    """
    pairs = list(zip(coo_matrix.col, coo_matrix.data))
    pairs.sort(key=lambda pair: (pair[1], pair[0]), reverse=True)
    return pairs
 
def extract_topn_from_vector(feature_names, sorted_items, topn=10):
    """Map the first ``topn`` entries of ``sorted_items`` to their feature names.

    Parameters
    ----------
    feature_names : sequence
        Looked up by the index stored in each item.
    sorted_items : sequence of (index, score) pairs
        Only the leading ``topn`` pairs are used, in the order given.
    topn : int, default 10
        Number of leading pairs to keep.

    Returns
    -------
    dict
        Feature name -> score rounded to one decimal place.
    """
    return {
        feature_names[position]: round(weight, 1)
        for position, weight in sorted_items[:topn]
    }

def extract_keys_for_row(doc, n):
    """Build a comma-separated keyword string for one document.

    The document is vectorised with the notebook-level ``cv`` and
    ``tfidf_transformer``; the ``n`` best-scoring terms (named through
    ``feature_names``) come first. After them follow the pieces of every
    multi-word term: both words of a two-word term, and for a three-word term
    its two adjacent word pairs followed by its three single words.
    Duplicates are not removed.
    """
    # Score this single document with the already-fitted vectorizer and transformer.
    doc_vector = tfidf_transformer.transform(cv.transform([doc]))
    ranked = sort_coo(doc_vector.tocoo())
    top_terms = list(extract_topn_from_vector(feature_names, ranked, n))

    pieces = []
    for term in top_terms:
        tokens = term.split()
        if len(tokens) == 2:
            pieces.extend(tokens)
        elif len(tokens) == 3:
            first, second, third = tokens
            pieces.extend([first + ' ' + second, second + ' ' + third, first, second, third])

    return ", ".join(top_terms + pieces)

#extract_keys_for_row(startup['Txt'][4],10)

In [12]:
# Derive each startup's keyword string from its 10 best-scoring TF-IDF terms.
startup['Keywords'] = startup['Txt'].apply(extract_keys_for_row, args=(10,))

In [13]:
# `encoding` and `error_bad_lines` are CSV-reader options, not read_excel parameters;
# current pandas raises TypeError when they are passed, so they are omitted.
tags = pd.read_excel("Keywords to Tags V5.xlsx")
# Blank out missing cells so the string-length test below applies to every row.
tags = tags.replace(np.nan, '', regex=True)
# Keep only rows with a non-empty 'Tags' value, then drop the per-slot tag columns.
tags = tags[tags['Tags'].str.len() != 0]
tags = tags.drop(columns=['Tag 1', 'Tag 2', 'Tag 3'])

In [14]:
# Keep just the startup name and its keyword string.
keywords = startup.loc[:, ['Nome', 'Keywords']]
#keywords.head()

In [15]:
# One row per (keyword, startup): each startup name is repeated for every entry
# of its ', '-separated keyword string, then the table is ordered by keyword.
words = (
    pd.concat([
        pd.Series(record['Nome'], index=record['Keywords'].split(', '))
        for _, record in keywords.iterrows()
    ])
    .reset_index()
    .rename(columns={"index": "Words", 0: "Nome"})
    .sort_values(by=['Words'])
)

In [16]:
def list_to_csv(text):
    """Merge comma-separated tag strings into one de-duplicated string.

    Parameters
    ----------
    text : iterable of str
        Strings that may each hold several ``", "``-separated entries.

    Returns
    -------
    str
        The distinct, non-empty, whitespace-stripped entries joined with
        ``", "`` in order of first appearance. The order is deterministic,
        unlike a ``set``-based de-duplication, whose ordering of strings
        varies between interpreter runs.
    """
    entries = (entry.strip() for entry in ", ".join(text).split(", "))
    # dict.fromkeys keeps insertion order, so duplicates collapse onto their first occurrence.
    unique_entries = dict.fromkeys(entry for entry in entries if entry)
    return ", ".join(unique_entries)
    
#list_to_csv(res['ComputedTags'][8])

In [17]:
# Pair every tag-mapping row with the startups whose keyword list contains that word.
res = tags.merge(words, on="Words", how='left')
# Collapse to one row per startup; every remaining column becomes a list of its values.
res = (
    res.groupby('Nome')
    .agg(lambda column: column.tolist())
    .rename(columns={'Tags': 'ComputedTags'})
)
# Flatten each startup's list of tag strings into a single comma-separated string.
res['ComputedTags'] = res['ComputedTags'].apply(list_to_csv)

In [18]:
# Left-join the computed tags onto the startup table and blank out missing values.
startup = (
    startup.merge(res, on="Nome", how='left')
    .replace(np.nan, '', regex=True)
)

In [19]:
# The intermediate keyword columns are no longer needed once tags are attached.
startup = startup.drop(columns=['Keywords', 'Words'])

In [29]:
# NOTE(review): this cell carries execution count In [29] and its saved output already
# shows the 'Predicted Trend' column, which is only added further down — the output is
# stale relative to this position; re-run the notebook top to bottom to refresh it.
startup

Unnamed: 0,Nome,Text,ComputedTags,Predicted Trend
0,CERCAOFFICINA,cercaofficina a portal to compare quotes for r...,"Vertical: Automotive, Application: Maintenance...",Future of Mobility
1,CAREPY,carepy created digital lassistente for medicat...,"Vertical: Pharmacy, Vertical: Health & Medicine",Exponential Medicine & Digital Health
2,AMBIENSVR,ambiensvr a startup that develops projects or ...,"Tag: Startup, Tech: Virtual Reality, Vertical:...",Restaurants and Future of Food and Beverage
3,COCOAPP,A software for handling reservations for beach,Tag: Software,Restaurants and Future of Food and Beverage
4,FLYFREE AIRWAYS,Enable to share private flights for small busi...,"Tag: Venture Capital, Vertical: Travel and Tou...",Restaurants and Future of Food and Beverage
...,...,...,...,...
86,FINDMYLOST,platform that allows you to regain your lost i...,"Tag: Platform, Vertical: Travel and Tourism, V...",Restaurants and Future of Food and Beverage
87,GLASS TO POWER,glass to power a spin off of the university of...,"Tag: Technology, Tag: University, Vertical: En...",Decarbonization & Transition to Renewable & Ne...
88,SKOUTY,platform to organize outdoor activities accomp...,"Tag: Platform, Vertical: Sports",Restaurants and Future of Food and Beverage
89,EMERGE,platform that allows buyers from all over the ...,"Tag: Platform, Vertical: Food & Beverage, Loca...",Restaurants and Future of Food and Beverage


## Find Tech Trend

In [21]:
# Vectorise the stop-word-free text with the pre-trained converter, then classify it.
# Note: this rebinds X, which previously held the CountVectorizer count matrix.
X = tfidfconverter.transform(startup['Txt'])
X = X.toarray()
predicted_trends = model.predict(X)

In [22]:
# Append the predictions as a new column; concat aligns the two objects on their index.
p_series = pd.Series(predicted_trends)
startup = pd.concat([startup, p_series], axis=1)
startup = startup.rename(columns={0: "Predicted Trend"})

In [23]:
# The cleaned text was only needed as model input.
startup = startup.drop(columns=['Txt'])

In [24]:
# Keep only the startups that received at least one computed tag.
startup = startup[startup['ComputedTags'].str.len() != 0]

In [25]:
# Preview the first ten rows of the tagged table with its predicted trends.
startup.head(10)

Unnamed: 0,Nome,Text,ComputedTags,Predicted Trend
0,CERCAOFFICINA,cercaofficina a portal to compare quotes for r...,"Vertical: Automotive, Application: Maintenance...",Future of Mobility
1,CAREPY,carepy created digital lassistente for medicat...,"Vertical: Pharmacy, Vertical: Health & Medicine",Exponential Medicine & Digital Health
2,AMBIENSVR,ambiensvr a startup that develops projects or ...,"Tag: Startup, Tech: Virtual Reality, Vertical:...",Restaurants and Future of Food and Beverage
3,COCOAPP,A software for handling reservations for beach,Tag: Software,Restaurants and Future of Food and Beverage
4,FLYFREE AIRWAYS,Enable to share private flights for small busi...,"Tag: Venture Capital, Vertical: Travel and Tou...",Restaurants and Future of Food and Beverage
5,FINDMYLOST,Innovative platform that aims to improving the...,"Tag: Platform, Vertical: Travel and Tourism, V...",Restaurants and Future of Food and Beverage
6,GOVOLT,Scooter sharing conceived and designed to be f...,"Tag: Sustainability, Application: Sharing Econ...",Restaurants and Future of Food and Beverage
7,EVJA,IoT irrigation system based on artificial inte...,"Tech: Artificial Intelligence, Vertical: Agric...",Restaurants and Future of Food and Beverage
8,BIOVECBLOCK,biovecblock the start-up that develops natural...,"Vertical: Health & Medicine, Tag: Disease",Restaurants and Future of Food and Beverage
9,EDILMAG,edilmag the platform for the sharing of machi...,"Tag: Platform, Application: Sharing Economy, T...",Restaurants and Future of Food and Beverage
