In [1]:
# Author: Olivier Grisel <olivier.grisel@ensta.org>
#         Lars Buitinck <L.J.Buitinck@uva.nl>
# License: BSD 3 clause

#from __future__ import print_function
from time import time

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import NMF
from sklearn.datasets import fetch_20newsgroups

n_samples = 2000
n_features = 1000
n_topics = 10
n_top_words = 20

# Load the 20 newsgroups dataset and vectorize it. We use a few heuristics
# to filter out useless terms early on: the posts are stripped of headers,
# footers and quoted replies, and common English words, words occurring in
# only one document or in at least 95% of the documents are removed.

t0 = time()
print("Loading dataset and extracting TF-IDF features...")
dataset = fetch_20newsgroups(shuffle=True, random_state=1,
                             remove=('headers', 'footers', 'quotes'))

vectorizer = TfidfVectorizer(max_df=0.95, min_df=2, max_features=n_features,
                             stop_words='english')
tfidf = vectorizer.fit_transform(dataset.data[:n_samples])
print("done in %0.3fs." % (time() - t0))

# Fit the NMF model
print("Fitting the NMF model with n_samples=%d and n_features=%d..."
      % (n_samples, n_features))
nmf = NMF(n_components=n_topics, random_state=1).fit(tfidf)
print("done in %0.3fs." % (time() - t0))

feature_names = vectorizer.get_feature_names()

for topic_idx, topic in enumerate(nmf.components_):
    print("Topic #%d:" % topic_idx)
    print(" ".join([feature_names[i]
                    for i in topic.argsort()[:-n_top_words - 1:-1]]))
    print()

Loading dataset and extracting TF-IDF features...
done in 1.523s.
Fitting the NMF model with n_samples=2000 and n_features=1000...
done in 3.116s.
Topic #0:
people think did time law government israel rights say case said make state true evidence don mr point gun let
()
Topic #1:
thanks know does mail advance hi info interested anybody email looking help appreciated card information list send post need video
()
Topic #2:
game team year games win play season players nhl toronto runs division flyers think goal hockey player won defense teams
()
Topic #3:
windows file dos using program use files window problem help os application running drivers version ms screen ftp available code
()
Topic #4:
edu soon com send university internet ftp mail mit information article pub cc mac hope email address contact blood program
()
Topic #5:
key chip clipper keys encryption government use public secure phone enforcement data nsa law doesn communications going security used encrypted
()
Topic #6:
car ne



In [79]:
import pandas as pd
import numpy as np
import time
import psycopg2
import pickle
import collections

ubuntu=True

con = None

if ubuntu:
    con = psycopg2.connect(database='mountainproject', password='kepler31', user='devin', port=5432, host='/var/run/postgresql/')
else:
    con = psycopg2.connect(database='mountainproject', user='User')
cur = con.cursor()
con.autocommit = True

In [44]:
q= '''
    SELECT climb_id, comment
    FROM comments;
    '''
cur.execute(q)
df = pd.DataFrame(cur.fetchall())
df.columns = ['climb_id', 'comment']
df.head()

Unnamed: 0,climb_id,comment
0,222,This is the most scary climb EVAH!!
1,105908771,"I like this route. Nice Rumney crimping, stra..."
2,105908771,Are you supposed to move out left to the arete...
3,105908771,"no quick clips anymore, bring yer ATC"
4,105908771,The anchors are just two glue-ins. No links. ...


In [125]:
df_collect = df.groupby(df.climb_id).apply(lambda x: x.sum()).reset_index(drop=True)

In [133]:
df_collect.iloc[4,1]

"This route is far from being a classic in my book. To me grunts are not classics. I have done this route and I have successfully avoided since. There are other routes on Inner Outlet that are better such as the Conn's Retable route on Inner Outlet. I would give this route zero stars or maybe a negative 3 starsI would argue this is a three star classic if you like squeeze chimneys, otherwise it is a 0 star nightmare.  This is one of the longest and most sustained pitches of moderate squeeze chimney I have ever been on.  Needles climbers venturing out to do long routes in the mountains, Yosemite, or the desert should test their mettle on this - climbing wide cracks is standard in most places. <br/> <br/>It certainly isn't as much fun to climb as Classic Crack however.  The best part is when it is over...Believe it or not this was one of the first trad climbs I ever led.  I had no idea what I was getting into.  I didn't own cams at the time and the small amount of gear that I did place w

In [143]:
n_samples = 2000
n_features = 1000
n_topics = 4
n_top_words = 10

vectorizer = TfidfVectorizer(max_df=0.9, min_df=0.05, max_features=n_features,
                             stop_words='english')
test_tfidf = vectorizer.fit_transform(df_collect.comment)

nmf = NMF(n_components=n_topics, random_state=1).fit(test_tfidf)
feature_names = vectorizer.get_feature_names()

word_lists = []
for topic_idx, topic in enumerate(nmf.components_):
    print("Topic #%d:" % topic_idx)
    print topic.argsort().shape
    word_lists += [[feature_names[i]
                    for i in topic.argsort()[-n_top_words - 1:-1]]]
    
    print(" ".join(word_lists[topic_idx]))
    print()


Topic #0:
(349,)
like just href did http style amp route com class
()
Topic #1:
(349,)
really climbing right moves crack crux great good climb fun
()
Topic #2:
(349,)
traverse anchor p2 ledge crack rap pitches belay rope second
()
Topic #3:
(349,)
clipping right replaced left second route anchors clip anchor bolts
()


In [144]:


d = collections.defaultdict(int)
for topic in word_lists:
    for word in topic:
        d[word] += 1



unique_words = []
for i, word_list in enumerate(word_lists):
    words = []
    for word in word_list:
        if d[word]<2:
            words += [word]
    unique_words += [words]
for topic in unique_words:
    print " ".join(topic)

like just href did http style amp com class
really climbing moves crux great good climb fun
traverse p2 ledge rap pitches belay rope
clipping replaced left anchors clip bolts


In [145]:
vectorizer.stop_words_

{u'theseanchors',
 u'sonja',
 u'ampitheatre',
 u'woods',
 u'spiders',
 u'hanging',
 u'ultimatley',
 u'woody',
 u'comically',
 u'sequencial',
 u'jbwr',
 u'regularize',
 u'hennings',
 u'sprague',
 u'originality',
 u'caned',
 u'starsdid',
 u'powercam',
 u'roxclamantis',
 u'discribed',
 u'wikiloc',
 u'bollwerk',
 u'defficult',
 u'rawhide',
 u'politician',
 u'fractal',
 u'wooded',
 u'spacy',
 u'grueling',
 u'wooden',
 u'wednesday',
 u'crotch',
 u'stereotypical',
 u'barelyto',
 u'raaaahaaad',
 u'scrapes',
 u'270',
 u'beqt',
 u'deadheads',
 u'sustaining',
 u'scraped',
 u'errors',
 u'cooking',
 u'localized',
 u'numeral',
 u'succumb',
 u'sevens',
 u'widget',
 u'chins',
 u'gileadi',
 u'chine',
 u'schultze',
 u'formationabout',
 u'china',
 u'natured',
 u'kids',
 u'uriosite',
 u'20rock',
 u'ericandlucie',
 u'controversy',
 u'cleanthe',
 u'concurrance',
 u'spotty',
 u'climbes',
 u'e8',
 u'appropriately',
 u'projection',
 u'lengthed',
 u'rockbiter',
 u'lengthen',
 u'e5',
 u'cranium',
 u'stern',
 u'p

##Brainstorming
Logistic regression:
star guessing based on observed features.
Which observed features mean more stars.


recommend based on past ticks, ratings, and star inputs

Additional recommenders:
popularity for people looking for popular or non-popular routes

