In [1]:
import pandas as pd
import string
import re
import os

In [2]:
import nltk
from textblob import TextBlob
from nltk.corpus import stopwords, wordnet
from nltk.tokenize import word_tokenize
from nltk.tag import pos_tag

from sklearn.feature_extraction.text import CountVectorizer

# Setup nltk corpora path
nltk_path = os.sep.join([os.environ['HOME'], 'nltk_data'])
nltk.data.path.insert(0, nltk_path)

### Data preprocessing

In [128]:
def preprocess_series_text(data, nltk_path=nltk_path):
    """Perform preprocessing on a Pandas series
       including removal of alpha numerical words,
       punctuation removal, tokenization, and stop word removal."""
    
    # remove alpha numerical words and make lowercase
    alphanum_re = re.compile(r"""\w*\d\w*""")
    data = alphanum_re.sub("", data.strip().lower())

    # remove punctuation
    punc_re = re.compile('[%s]' % re.escape(string.punctuation))    
    data = punc_re.sub(' ', data)

    translator = str.maketrans('', '', string.punctuation)
    data = data.translate(translator)

    # tokenize words    
    words = word_tokenize(data)
    
    # remove stop words
    sw = stopwords.words('english')
    sw.extend(["â€“",'–',"“","“off","“the","“you’ve","tv”",'v',"it’s","i’ve","“rss","”",'%',"â€™"])
    sw.extend(['january','february','march','april','may','june','july','august','september','october','november','december'])   
    sw.extend(["that’s","there’s'","yesterday","tomorrow","today","i’m","“if","here’s"])
    words = list(filter(lambda y: y not in sw, words))
    data = ' '.join(words) # we want to returm data as text. i.e. string
    
    nouns = [token for token, pos in pos_tag(words) if pos.startswith('N')]
    data_nouns = ' '.join(nouns)
    
    return data, data_nouns

### Combining files from multiple courses into one: a file per year of data

In [129]:
# The code below reads files scrapped from multiple sources, then combines data by year and saves it into a pkl file. 

years = ['2006','2007','2008','2009','2010','2011','2012','2013','2014','2015','2016','2017']

# Count the total number of articles from all sources
articles_total = 0

for year in years:
    
    texts = []
    dates = []
    blobs = []
    tags = []
    text_nouns = []
    df_tmp = pd.DataFrame()
    
    # The folder "Data_AllSource" contains 24 .csv files: 12 from TechCrunch website and 12 from Venturebeat website.
    # They are the result of scraping both sites year by year.
    # Their names contain the year, for example, "techcrunch2006.csv"
    
    for filename in os.listdir(os.getcwd()+"/Data_AllSources/"):
        
        if year in filename:
            
            with open("Data_AllSources/"+filename,'r') as f:

                df = pd.read_csv(f)
                print("File:",filename," number of lines:",len(df))
                articles_total = articles_total + len(df)

                dates.append(df['date'])        
                df['text'].fillna("", inplace=True)
                df['tags'].fillna("", inplace=True)

                for i in range(len(df)):

                    text_tmp = df['text'][i]
                    if isinstance(text_tmp, str):
                        text_tmp = text_tmp.replace('\n', ' ')
                    else:
                        text_tmp = str(text_tmp).replace('\n', ' ')

                    data, data_nouns = preprocess_series_text(text_tmp)
                    texts.append(data)
                    text_nouns.append(data_nouns)

                    # to use in Approach 1    
                    blob = TextBlob(data)
                    blob_nouns = list(blob.noun_phrases)
                    blobs.append(blob_nouns)

                    # to use in Approach 2
                    data_tags, data_tag_nouns = preprocess_series_text(df['tags'][i])
                    blob = TextBlob(data_tags)
                    blob_tags = list(blob.words)
                    tags.append(blob_tags)

        else:
            continue
            
    # writing texts and text_nouns from both files dated with the same year into one df        
    df_tmp['text'] = texts
    df_tmp['nouns'] = text_nouns
    df_tmp.to_pickle("Data_AllSources/PKLs/"+"pkl"+str(year)+".pkl")
print("Total number of articles:", articles_total)

File: techcrunch2006.csv  number of lines: 4562
File: venturebeat2006.csv  number of lines: 1607
File: techcrunch2007.csv  number of lines: 12463
File: venturebeat2007.csv  number of lines: 3268
File: techcrunch2008.csv  number of lines: 16576
File: venturebeat2008.csv  number of lines: 5455
File: techcrunch2009.csv  number of lines: 16234
File: venturebeat2009.csv  number of lines: 8886
File: techcrunch2010.csv  number of lines: 16139
File: venturebeat2010.csv  number of lines: 9605
File: techcrunch2011.csv  number of lines: 15265
File: venturebeat2011.csv  number of lines: 9197
File: techcrunch2012.csv  number of lines: 14432
File: venturebeat2012.csv  number of lines: 13366
File: techcrunch2013.csv  number of lines: 13764
File: venturebeat2013.csv  number of lines: 13504
File: techcrunch2014.csv  number of lines: 12928
File: venturebeat2014.csv  number of lines: 13828
File: techcrunch2015.csv  number of lines: 11974
File: venturebeat2015.csv  number of lines: 13809
File: techcrunch2

### Listed below are attempts to retrieve specific information, i.e. technology names, from the documents.

### Attempt 1: Use noun phrases from TextBlob.
#### Conclusion: the result looks noisy, the approach has been abandoned.

In [9]:
# blobs_allDocs is a list of all noun phrases in all articles, each of which may occur multiple times
blobs_allDocs = sum(blobs,[])
# blobs_vocab is a list of all noun phrases in all articles, each phrase occurs only once
blobs_vocab = list(set(blobs_allDocs))

# Converting a collection of articles to a matrix of token counts using predefined vocabulary built from the noun phrases
cv_noun_phrases = CountVectorizer(vocabulary=blobs_vocab)
# Noun phrases feature matrix
feature_matrix_noun_phrases = cv_noun_phrases.fit_transform(texts).toarray()

In [10]:
print("Noun phrases feature matrix size: ",feature_matrix_noun_phrases.shape)

mydict_noun_phrases = cv_noun_phrases.vocabulary_
print(dict(list(mydict_noun_phrases.items())[0:20]))

Noun phrases feature matrix size:  (6169, 114751)
{'loss leaders —': 0, 'img src https files wordpress com mac pros lcds': 1, 'contact lenses': 2, 'heavy duty home': 3, '“that” peripheral value village year trust': 4, 'netscape update': 5, 'they’ll building': 6, 'toaster amps workspace roundup ergonomic chairs solar': 7, 'successful investments companies': 8, 'ati radeons nvidia geforces': 9, 'black lung we’re': 10, 'sides we’d': 11, 'transaction cilion denis segota matthew bartus': 12, 'site jigsaw isn’t': 13, 'webmethods inc': 14, 'personal decision': 15, 'real estate agents brokers': 16, 'open beta': 17, 'industries family tradition': 18, 'selection choice quotes': 19}


### Attempt 2: Use the words in the field "tags" from the scraped files or from the last sentence in the articles that starts with "Tags:" if present.
#### Conclusion: tags are not consistent throughout the years, the approach has been abondoned.

In [11]:
# tags_allDocs is a list of tags from all articles, each of which may occur multiple times
tags_allDocs = sum(tags,[])
# tags_vocab is a list of all tags from all articles, each tag occurs only once
tags_vocab = list(set(tags_allDocs))

# Converting a collection of articles to a matrix of token counts using predefined vocabulary built from the tags
cv_tags = CountVectorizer(vocabulary=tags_vocab)
# Tags feature matrix
feature_matrix_tags = cv_tags.fit_transform(texts).toarray()

In [12]:
print("Tags feature matrix size: ",feature_matrix_tags.shape)

mydict_tags = cv_tags.vocabulary_
print(dict(list(mydict_tags.items())[0:20]))

Tags feature matrix size:  (6169, 1284)
{'raid': 0, 'roundup': 1, 'webaroo': 2, 'keys': 3, 'sina': 4, 'headset': 5, 'york': 6, 'alcohol': 7, 'heart': 8, 'twitter': 9, 'kids': 10, 'semantic': 11, 'wink': 12, 'freecharge': 13, 'home': 14, 'myspace': 15, 'coffee': 16, 'estimates': 17, 'trusted': 18, 'zigtag': 19}


### Attempt 3: Apply usual n-grams to noun-only texts.
#### Conclusion: bigrmas (attempt 3.2) and trigrmas (attempt 3.3) are not what I need, but unigrams (attempt 3.1) show the promise.

In [13]:
# Attepmpt 3.1: apply usual n-grams to noun-only texts, unigrams
cv_nouns_1 = CountVectorizer(ngram_range=(1,1)
#                            ,min_df=0.01 # playing with parameters
                           ,max_df=0.01
                          )
feature_matrix_nouns_1 = cv_nouns_1.fit_transform(text_nouns).toarray()

In [14]:
print("Only-noun unigram feature matrix size: ",feature_matrix_nouns_1.shape)

mydict_nouns_1 = cv_nouns_1.vocabulary_
print(dict(list(mydict_nouns_1.items())[0:20]))

Only-noun unigram feature matrix size:  (6169, 22145)
{'collaborative': 3720, 'api': 788, 'airset': 406, 'write': 21740, 'traction': 20073, 'lucas': 11474, 'gonze': 8095, 'webjay': 21284, 'burton': 2598, 'playlists': 14716, 'honolulu': 9025, 'hawaii': 8643, 'developement': 5172, 'excuses': 6536, 'diligence': 5301, 'congratulation': 4037, 'riya': 16538, 'recovery': 15955, 'fiasco': 6941, 'killer': 10607}


In [15]:
# Attempt 3.2: apply usual n-grams to noun-only texts, bigrams
cv_nouns_2 = CountVectorizer(ngram_range=(2,2)
                           ,min_df=0.01
#                            ,max_df=0.001 # playing with parameters
                          )
feature_matrix_nouns_2 = cv_nouns_2.fit_transform(text_nouns).toarray()

In [16]:
print("Only-noun bigram feature matrix size: ",feature_matrix_nouns_2.shape)

mydict_nouns_2 = cv_nouns_2.vocabulary_
print(dict(list(mydict_nouns_2.items())[0:20]))

Only-noun bigram feature matrix size:  (6169, 46)
{'cell phone': 3, 'they ve': 29, 'market share': 12, 'email address': 6, 'search engine': 22, 'search results': 23, 'home page': 7, 'venture capital': 31, 'page views': 16, 'business model': 0, 'silicon valley': 24, 'partners ventures': 18, 'they ll': 27, 'venture capitalists': 33, 'york times': 42, 'kleiner perkins': 11, 'mountain view': 15, 'we ve': 40, 'co founder': 4, 'we ll': 38}


In [17]:
# Attempt 3.3: apply usual n-grams to noun-only texts, trigrams
cv_nouns_3 = CountVectorizer(ngram_range=(3,3)
#                            ,min_df=0.01
#                            ,max_df=0.001
                          )
feature_matrix_nouns_3 = cv_nouns_3.fit_transform(text_nouns).toarray()

In [18]:
print("Only-noun bigram feature matrix size: ",feature_matrix_nouns_3.shape)

mydict_nouns_3 = cv_nouns_3.vocabulary_
print(dict(list(mydict_nouns_3.items())[0:20]))

Only-noun bigram feature matrix size:  (6169, 343293)
{'collaborative calendar contacts': 46989, 'calendar contacts application': 31611, 'contacts application api': 60400, 'application api integration': 11906, 'api integration verizon': 11036, 'integration verizon yesterday': 137341, 'verizon yesterday access': 316918, 'yesterday access calendar': 339969, 'access calendar contact': 596, 'calendar contact information': 31610, 'contact information verizon': 60335, 'information verizon cell': 135928, 'verizon cell phone': 316824, 'cell phone services': 38641, 'phone services needs': 206898, 'services needs businesses': 255723, 'needs businesses office': 181623, 'businesses office standard': 30019, 'office standard services': 189572, 'standard services airset': 274537}


### Attempt 4: Apply unsupervised clustering in hope to identify clusters and thus technologies.

#### DBSCAN did not produce any result

In [22]:
from sklearn.cluster import DBSCAN

In [23]:
db = DBSCAN(eps=1.0, min_samples=10).fit(feature_matrix_nouns_1)
labels = db.labels_
print(len(labels))
print(labels)

6169
[-1 -1 -1 ..., -1 -1 -1]


#### KMeans did produce the result, but the result didn't make it to the final presentation. KMeans proved to be time-consuming, the algorithms is located in a separate notebook *TechNews_KMeans.ipynb*.