# Collecting Insights from the Text Data
---
In this notebook I will extract the top n-grams from the comments I collected to see if I can gain some insights about the current state of the cryptocurrency space.

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

%matplotlib inline 

plt.style.context('seaborn')
pd.options.display.max_rows = 50

In [2]:
X = pd.read_csv('./top_final.csv')
X2 = pd.read_csv('./sec_final.csv')

  interactivity=interactivity, compiler=compiler, result=result)


In [3]:
X2['Comment'] = X2['Comment'].astype(str)

In [10]:
from sklearn.feature_extraction.text import TfidfVectorizer
from collections import Counter

# We can use the TfidfVectorizer to find ngrams for us
vect = TfidfVectorizer(ngram_range=(2,4), stop_words='english')

# Pulls all of titles text's into one giant string
summaries1 = "".join(X['Comment'])
ngrams_summaries1 = vect.build_analyzer()(summaries1)

top100 = Counter(ngrams_summaries1).most_common(100)

In [5]:
#Repeating the process for second comments
vect = TfidfVectorizer(ngram_range=(2,5), stop_words='english')
summaries1 = "".join(X2['Comment'])
ngrams_summaries1 = vect.build_analyzer()(summaries1)

sec100 = Counter(ngrams_summaries1).most_common(100)



In [14]:
#Filtering out urls and other undesirable n-grams
top_list =[]
for el in top100:
    top_list.append(el[0])

In [15]:
exclude = ['https www', 'reddit com', 'www reddit', 'www reddit com', 'https www reddit com', 'https www reddit', 'com cryptocurrency', 'reddit com cryptocurrency','https www reddit com cryptocurrency', 'http www reddit com', ]

In [16]:
top_list

[u'market cap',
 u'https www',
 u'long term',
 u'guys think',
 u'looks like',
 u'btc eth',
 u'feel like',
 u'alt coins',
 u'imgur com',
 u'https imgur com',
 u'https imgur',
 u'time buy',
 u'reddit com',
 u'good time',
 u'don know',
 u'short term',
 u'hey guys',
 u'www reddit',
 u'www reddit com',
 u'https www reddit com',
 u'https www reddit',
 u'days ago',
 u'new crypto',
 u'eth ltc',
 u'does know',
 u'don want',
 u'eth btc',
 u'twitter com',
 u'want buy',
 u'https twitter',
 u'https twitter com',
 u'bitcoin cash',
 u'sell wall',
 u'youtube com',
 u'youtube com watch',
 u'low market',
 u'com watch',
 u'buy eth',
 u'com cryptocurrency',
 u'www youtube com',
 u'www youtube',
 u'buy dip',
 u've seen',
 u'crypto market',
 u'just bought',
 u've got',
 u'www youtube com watch',
 u'low market cap',
 u'weeks ago',
 u'https www youtube',
 u'https www youtube com',
 u'reddit com cryptocurrency',
 u'just got',
 u'cryptocurrency comments',
 u'pump dump',
 u'reddit com cryptocurrency comments',
 

In [19]:
top100_filtered = [x for x in top100 if x[0] not in exclude]

In [20]:
top100_filtered

[(u'market cap', 1595),
 (u'long term', 1026),
 (u'guys think', 914),
 (u'looks like', 773),
 (u'btc eth', 676),
 (u'feel like', 643),
 (u'alt coins', 554),
 (u'imgur com', 490),
 (u'https imgur com', 475),
 (u'https imgur', 475),
 (u'time buy', 474),
 (u'good time', 452),
 (u'don know', 446),
 (u'short term', 439),
 (u'hey guys', 398),
 (u'days ago', 384),
 (u'new crypto', 338),
 (u'eth ltc', 337),
 (u'does know', 330),
 (u'don want', 327),
 (u'eth btc', 323),
 (u'twitter com', 316),
 (u'want buy', 311),
 (u'https twitter', 292),
 (u'https twitter com', 291),
 (u'bitcoin cash', 279),
 (u'sell wall', 271),
 (u'youtube com', 266),
 (u'youtube com watch', 256),
 (u'low market', 256),
 (u'com watch', 256),
 (u'buy eth', 253),
 (u'www youtube com', 245),
 (u'www youtube', 245),
 (u'buy dip', 245),
 (u've seen', 244),
 (u'crypto market', 244),
 (u'just bought', 238),
 (u've got', 237),
 (u'www youtube com watch', 237),
 (u'low market cap', 236),
 (u'weeks ago', 236),
 (u'https www youtube',

In [21]:
sec100_filtered = [x for x in sec100 if x[0] not in exclude]

In [22]:
sec100_filtered

[(u'market cap', 2426),
 (u'long term', 2338),
 (u'btc eth', 936),
 (u'short term', 894),
 (u'don know', 845),
 (u'don think', 806),
 (u'looks like', 650),
 (u'days ago', 538),
 (u'feel like', 468),
 (u'time buy', 459),
 (u'eth btc', 441),
 (u'pump dump', 434),
 (u'good luck', 430),
 (u'www reddit com cryptocurrency', 429),
 (u'lot people', 425),
 (u'make money', 420),
 (u'eth ltc', 399),
 (u'make sure', 378),
 (u'low market', 365),
 (u'alt coins', 361),
 (u'working product', 358),
 (u'term hold', 342),
 (u'good time', 338),
 (u'weeks ago', 328),
 (u'don want', 327),
 (u'low market cap', 321),
 (u'long time', 320),
 (u'http www', 318),
 (u'buy eth', 310),
 (u'just bought', 304),
 (u'long term hold', 304),
 (u'coinmarketcap com', 304),
 (u've got', 303),
 (u'couple days', 293),
 (u've seen', 285),
 (u'coins like', 283),
 (u'think ll', 282),
 (u'sounds like', 281),
 (u'just like', 275),
 (u'good idea', 265),
 (u'pretty good', 259),
 (u'high risk', 253),
 (u'buy sell', 252),
 (u'cryptocur