In [1]:
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
import pandas as pd
import numpy as np
import os
import re
import csv

In [2]:
# The file "Clusters.csv" is a result of applying LDA and NMF to the origial articles. 
# Each line contains year, method (LDA or NMF), number of cluster, and features (10 words that describe the cluster)

df = pd.read_csv("Clusters/Clusters.csv")

df1 = df.groupby("Year")['Features'].apply(lambda x: ' '.join(x)).reset_index()
documents = df1.Features
years = df1.Year

In [3]:
df1

Unnamed: 0,Year,Features
0,2006,google site users search web video service lik...
1,2007,like ll video new don game time know music goo...
2,2008,game like ll video games don time new know goo...
3,2009,like video ll don time know new good people re...
4,2010,company million funding round capital raised c...
5,2011,company million companies social funding start...
6,2012,company million startup companies data new fun...
7,2013,company million data companies said funding st...
8,2014,company million data companies funding round s...
9,2015,company million companies data said startup bu...


In [4]:
stopwords = ['account','autonomous','announced','ar','available','based','billion','board','build','business']
stopwords += ['buy','buzz','calif','calls','capital','case','cell','ceo','check','china','click','companies']
stopwords += ['comments','company','computer','conference','connected','contest','controller','couple']
stopwords += ['credit','daily','david','deal','deals','dev','disrupt','don','drm','drug','drugs','ds','ea','email']
stopwords += ['enterprise','event','experience','experiences','founder','files','follow','francisco','free']
stopwords += ['funding','gamesbeat','going','got','government','growth','halo','http','inch','industry']
stopwords += ['investors','ipo','japan','jobs','john','know','launch','life','like','links','list','little','ll']
stopwords += ['make','lot','market','mercury','million','money','mr','need','new','news','offer','office','oil']
stopwords += ['operating','partners','patent','patents','pay','percent','people','president','post','posted']
stopwords += ['power','price','privacy','product','project','psp','public','quarter','raised','really']
stopwords += ['release','results','revenue','right','round','said','sales','san','says','seattle','service']
stopwords += ['services','silicon','site','sites','smart','software','steve','stock','store','street','students']
stopwords += ['super','team','tech','things','think','time','tips','title','today','trump','uk','update']
stopwords += ['user','users','valley','vehicle','vehicles','venture','ventures','version','want','way','work']
stopwords += ['world','www','year','years','york','zuckerberg','developer','developers']
stopwords += ['drive','driver','driving','drivers','good','technology','startup','startups']

companies = ['amazon','apple','digg','facebook','friendfeed','foursquare','google','groupon','hp','htc','myspace']
companies += ['instagram','intel','microsoft','netflix','nexus','nintendo','nokia','nsa','oculus','palm','samsung']
companies += ['skype','slack','snapchat','sony','spotify','sprint','techcrunch']
companies += ['tesla','twitter','uber','venturebeat','verizon','vista','yahoo','youtube','zynga','zune']

print('stopwords',len(stopwords))
print("companies' names", len(companies))

stopwords 170
companies' names 38


In [18]:
def lemmatization(word):
    if (word == 'ad') | (word == 'ads'):
        word = 'advertising'
    elif (word == 'app') | (word == 'apps') | (word == 'applications'):
        word = 'application'
    elif word == 'bots':
        word = 'bot'
    elif word == "cars":
        word = 'car'
    elif word == 'devices':
        word = 'device'
    elif  word == 'intelligence':
        word = 'ai'
    elif (word == 'gamers') | (word == 'gaming') | (word == 'games'):
        word = 'game'
    elif word == 'phones':
        word = 'phone'
    elif (word == 'machine') | (word == 'learning'):
        word = 'ml'
    elif (word == 'played') | (word == 'playing'):
        word = 'play'
    elif (word == 'players'):
        word = 'player'
    elif word == 'videos':
        word = 'video'
    elif (word == 'virtual') | (word =='reality'):
        word = 'vr'

    return word

In [6]:
# To build vizualization for both technology names and companies' names

def removeStopwords(wordlist):
    return [w for w in wordlist if w not in stopwords]

In [7]:
# To build vizualization for only technology names

def removeStopwordsANDCompanies(wordlist):
    SWandCo = stopwords + companies
    return [w for w in wordlist if w not in SWandCo]

In [8]:
# To build vizualization for only companies' names

def Companies(wordlist):
    return [w for w in wordlist if w in companies]

In [9]:
# Calculating the total number of words in df1 to normalize word count across all years

def total_num_words(opt):
    
    unique_words_total = []

    for row in range(len(df1)):

        year = df1.iloc[row][0]

        wordlist = df1.iloc[row][1].split()

        if opt == 1:
            wordlist = removeStopwords(wordlist)
        elif opt == 2:
            wordlist = removeStopwordsANDCompanies(wordlist)
        elif opt == 3:
            wordlist = Companies(wordlist)
        
        unique_words_total.append(wordlist) 
    
    unique_words_total = sum(unique_words_total,[])
    num_words_total = len(unique_words_total)
    unique_words_total = list(set(unique_words_total))
    num_unique_words_total = len(list(set(unique_words_total)))
    print("Total number of unique words (counted 1 time) in all clusters",num_unique_words_total)
    print("Total number of unique words (counted as many times as they are used) in all clusters",num_words_total)
    print("Unique words 2",unique_words_total)
    return num_words_total, sorted(unique_words_total)

In [19]:
# This function generates strings to be copied into TechInnovationsD3.html file to build visualization
# The foramt is  { "Tech":"phone","Date":"2005","Value":1000 }

def data_for_D3(opt, num_words_total, unique_words_total):
    df_D3_bubble = pd.DataFrame(columns = ('Year','D3_data'))
    df_D3_bubble_ = pd.DataFrame(columns = ('Year','D3_data'))
    
    fieldnames = [['date']]
    fieldnames.append(unique_words_total)
    fieldnames = sum(fieldnames,[])
    df_D3_line = pd.DataFrame(columns = fieldnames)
    
    df_D3_bubble1 = pd.DataFrame(columns = ('id','value'))

    for row in range(len(df1)):

        year = df1.iloc[row][0]
        print("--- Year",year) 
        
        df_D3_bubble1.append({'id':year,'value':""}, ignore_index=True)
    
        wordlist = df1.iloc[row][1].split()
        
        if opt == 1:
            wordlist = removeStopwords(wordlist)         
        elif opt == 2:
            wordlist = removeStopwordsANDCompanies(wordlist)
        elif opt == 3:
            wordlist = Companies(wordlist)
        
        wordfreq = []
        for w in wordlist:
            wordfreq.append(wordlist.count(w))

        l = sorted(list(zip(wordlist, wordfreq)), key=lambda x: x[1], reverse = True)
        ll = list(set(l))
        lll = sorted(ll, key=lambda x: x[1], reverse=True)

        """
        I need the following format to insert into TechInnovationsD3.html file 
        { "Tech":"phone","Date":"2005","Value":1000 },
        
        """    
    
        top_words = 30
        for i in lll[:top_words]:
            normalized_word_count = i[1]/num_words_total
            D3_word = lemmatization(i[0])
            lll_D3 = '{ "Tech":' + '"' + D3_word + '"' + ',"Date":' + '"' + str(year) + '"' + ',"Value":'+ str(normalized_word_count) + ' },'
            # multiplying 'normalized_word_count' by 10,000 for the value to be seen in visualization
            lll_D3_ = '{ "Tech":' + '"' + D3_word + '"' + ',"Date":' + '"' + str(year) + '"' + ',"Value":'+ str(normalized_word_count*10000) + ' },'
            df_D3_bubble = df_D3_bubble.append({'Year':year,'D3_data':lll_D3}, ignore_index=True)
            df_D3_bubble_ = df_D3_bubble_.append({'Year':year,'D3_data':lll_D3_}, ignore_index=True)
            
            df_D3_line_date = str(year)+'0101'
            df_D3_line = df_D3_line.append({'date':df_D3_line_date, D3_word:i[1]}, ignore_index=True)
            
            id_D3 = str(year) + "." + D3_word
            df_D3_bubble1 = df_D3_bubble1.append({'id':id_D3,'value':i[1]}, ignore_index=True)
               
    df_D3_line_noNA = df_D3_line.fillna(0)
    print("df_D3_line_noNA",df_D3_line_noNA.shape)
    
    df_D3_bubble1 = df_D3_bubble1.groupby('id')['value'].sum().reset_index()
    
    df_D3_bubble.to_csv("Viz/TechInnovationsD3_opt"+str(opt)+".csv")
    df_D3_bubble_.to_csv("Viz/TechInnovationsD3_opt"+str(opt)+"_.csv")
    df_D3_line_noNA.to_csv("Viz/TechInnovationsD3_line_opt"+str(opt)+".csv", index = False)
    df_D3_bubble1.to_csv("Viz/TechInnovationsD3_bub1_opt"+str(opt)+".csv", index = False)

In [38]:
# Option 1: use all words that are present in clusters from Clusters.csv 
# including names of both technologies and companies

num_words_total = total_num_words(1)
data_for_D3(1,num_words_total)

--- Year 2006 lenght of the word set 74
--- Year 2007 lenght of the word set 81
--- Year 2008 lenght of the word set 80
--- Year 2009 lenght of the word set 80
--- Year 2010 lenght of the word set 78
--- Year 2011 lenght of the word set 66
--- Year 2012 lenght of the word set 70
--- Year 2013 lenght of the word set 71
--- Year 2014 lenght of the word set 66
--- Year 2015 lenght of the word set 65
--- Year 2016 lenght of the word set 82
--- Year 2017 lenght of the word set 78
Total number of words (unique features) in all clusters 891
--- Year 2006
--- Year 2007
--- Year 2008
--- Year 2009
--- Year 2010
--- Year 2011
--- Year 2012
--- Year 2013
--- Year 2014
--- Year 2015
--- Year 2016
--- Year 2017


In [20]:
# Option 2: remove stopwords and companies' names 

num_words_total,unique_words_total = total_num_words(2)
data_for_D3(2,num_words_total,unique_words_total)

Total number of unique words (counted 1 time) in all clusters 176
Total number of unique words (counted as many times as they are used) in all clusters 4475
Unique words 2 ['playing', 'handsets', 'windows', 'electric', 'ray', 'iphone', 'ad', 'movie', 'gaming', 'social', 'photo', 'sound', 'story', 'web', 'games', 'engine', 'device', 'nano', 'xbox', 'health', 'bot', 'human', 'content', 'learning', 'blu', 'alexa', 'ai', 'self', 'blog', 'playstation', 'os', 'rift', 'flash', 'sharing', 'chatbots', 'reader', 'messages', 'vr', 'use', 'tv', 'storage', 'chip', 'information', 'marketing', 'mac', 'touch', 'networks', 'music', 'glass', 'handset', 'android', 'tweet', 'chatbot', 'wireless', 'open', 'messenger', 'artificial', 'image', 'song', 'apps', 'crunchgear', 'gmail', 'home', 'phones', 'space', 'chat', 'cars', 'internet', 'tablets', 'applications', 'gamers', 'aws', 'ads', 'feature', 'download', 'fi', 'bots', 'digg', 'chrome', 'energy', 'app', 'videos', 'card', 'grid', 'streaming', 'page', 'scree

In [178]:
# Option 3: keep only compnaies' names to build vosualization for compnaies evolution 

num_words_total,unique_words_total = total_num_words(3)
data_for_D3(3,num_words_total,unique_words_total)

Total number of unique words (counted 1 time) in all clusters 38
Total number of unique words (counted as many times as they are used) in all clusters 1231
Unique words 2 ['yahoo', 'zune', 'venturebeat', 'sprint', 'techcrunch', 'vista', 'intel', 'samsung', 'nintendo', 'google', 'groupon', 'nsa', 'microsoft', 'friendfeed', 'hp', 'apple', 'facebook', 'myspace', 'snapchat', 'youtube', 'slack', 'oculus', 'tesla', 'zynga', 'spotify', 'foursquare', 'instagram', 'palm', 'netflix', 'nexus', 'htc', 'amazon', 'twitter', 'skype', 'verizon', 'uber', 'sony', 'nokia']
--- Year 2006
--- Year 2007
--- Year 2008
--- Year 2009
--- Year 2010
--- Year 2011
--- Year 2012
--- Year 2013
--- Year 2014
--- Year 2015
--- Year 2016
--- Year 2017
df_D3_line_noNA (163, 39)


In [125]:
import csv

with open('names.csv', 'w') as csvfile:
    words = ['aa','bb','cc']
    year = 2006
    fieldnames = [['year']]
    fieldnames.append(words)
    fieldnames = sum(fieldnames,[])
    writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
    val = 0
    writer.writeheader()
    for i in words:
        writer.writerow({'year': year , i: val})
        val += 1

In [199]:
df_ = pd.DataFrame([['c',1,1],['c',2,2],['cc',4,5]],columns = ['aa','a','b'])

In [200]:
df_

Unnamed: 0,aa,a,b
0,c,1,1
1,c,2,2
2,cc,4,5


In [202]:
df_ = df_.groupby('aa')['a'].sum().reset_index()

In [203]:
df_

Unnamed: 0,aa,a
0,c,3
1,cc,4
