## Project 6

In [21]:
import collections
import csv
import os
import re
import subprocess
import urllib

import matplotlib.pyplot as plt
import nltk
import numpy as np
import pandas as pd
import psycopg2
import requests
import scipy
from bs4 import BeautifulSoup
from imdbpie import Imdb
from nltk.tokenize import RegexpTokenizer
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import RandomForestRegressor
from sklearn.feature_extraction.text import CountVectorizer

%matplotlib inline

### Pre-Work: Write a problem statement 

## Part 1: Acquire the Data

#### 1. Connect to the IMDB API

In [22]:
# Create the IMDb client once. The original cell built three clients in a row
# and kept only the last one, so this is the single construction that was
# actually in effect (the `cache` option enabled, everything else default).
imdb = Imdb(cache=True)

#### 2. Query the top 250 rated movies in the database

In [23]:
# Ask the IMDb client for its top-250 list and load the records into a frame.
top_250_records = imdb.top_250()
movies = pd.DataFrame(top_250_records)

#### 3. Only select the top 25 movies and delete the unnecessary rows

In [24]:
# sort_values returns a new frame rather than sorting in place, so the result
# must be assigned back; otherwise the slice below just takes the first 25
# rows in whatever order the API returned them.
# A stable sort (mergesort) keeps the incoming order for equal ratings.
movies = movies.sort_values(by='rating', ascending=False, kind='mergesort')
movies = movies[:25]
movies.head()

Unnamed: 0,can_rate,image,num_votes,rating,tconst,title,type,year
0,True,{u'url': u'http://ia.media-imdb.com/images/M/M...,1677125,9.3,tt0111161,The Shawshank Redemption,feature,1994
1,True,{u'url': u'http://ia.media-imdb.com/images/M/M...,1147488,9.2,tt0068646,The Godfather,feature,1972
2,True,{u'url': u'http://ia.media-imdb.com/images/M/M...,784568,9.0,tt0071562,The Godfather: Part II,feature,1974
3,True,{u'url': u'http://ia.media-imdb.com/images/M/M...,1662425,9.0,tt0468569,The Dark Knight,feature,2008
4,True,{u'url': u'http://ia.media-imdb.com/images/M/M...,858289,8.9,tt0108052,Schindler's List,feature,1993


#### 4. Write the Results to a csv

In [25]:
#movies.to_csv('../Assets/Project6/top25.csv')

## Part 2: Wrangle the text data

#### 1. Convert the listing identification numbers (tconst) from the first dataframe to a list

In [26]:
# Pull the title identifier column out of the frame as a plain Python list.
tconst = movies.loc[:, 'tconst'].tolist()

#### 2. Scrape the reviews for the top 25 movies

*Hint*: Use a loop to scrape each page at once

In [27]:
# imdbpie can fetch the reviews of a title from its title id.
# Id and Text are parallel lists with one entry per review: Id[k] is the
# title id that the review text Text[k] belongs to.
Text = []
Id = []
for title_id in tconst:
    # `or []` skips titles for which nothing comes back; iterating over a
    # None result would otherwise raise a TypeError and abort the whole scrape.
    reviews = imdb.get_title_reviews(title_id, max_results=15) or []
    for review in reviews:
        Id.append(title_id)
        Text.append(review.text)

In [28]:
# One row per review: the title id next to the review text.
# Building the frame in a single step names the columns directly and also
# works when nothing was scraped; the previous build-then-rename approach
# failed in that case because a one-column frame was given two column names.
revs = pd.DataFrame({'Id': Id, 'Text': Text}, columns=['Id', 'Text'])

In [29]:
# Preview the first five review rows.
revs.head(n=5)

Unnamed: 0,Id,Text
0,tt0111161,Why do I want to write the 234th comment on Th...
1,tt0111161,"\nCan Hollywood, usually creating things for e..."
2,tt0111161,\nI have never seen such an amazing film since...
3,tt0111161,"In its Oscar year, Shawshank Redemption (writt..."
4,tt0111161,The reason I became a member of this database ...


#### 5. Tokenize the Output

In [30]:
N_TOP_ADJECTIVES = 15  # number of most frequent adjectives kept as features

# Split every review into tokens: words, dollar amounts, or any other run of
# non-space characters. The raw string keeps the regex backslashes literal.
tokenizer = RegexpTokenizer(r'\w+|\$[\d\.]+|\S+')
tokens = [tokenizer.tokenize(review) for review in Text]

# Attach a part-of-speech tag to every token (verb, adjective, ...).
tag = [nltk.pos_tag(review_tokens) for review_tokens in tokens]

# Keep the adjectives (tag "JJ"). They are lower-cased after tagging because
# CountVectorizer lower-cases the reviews by default: a capitalised vocabulary
# entry would never match and would produce an all-zero column.
adj = [word.lower()
       for tagged_review in tag
       for word, pos in tagged_review
       if pos == "JJ"]

# Count the adjectives and keep the N_TOP_ADJECTIVES most frequent ones.
count = collections.Counter(adj)
common = [word for word, n_seen in count.most_common(N_TOP_ADJECTIVES)]
# A parenthesised single argument prints the same text on Python 2 and 3.
print('The most common adjective are:\n {}'.format(common))

# With a fixed vocabulary and binary=True, every column records whether the
# corresponding adjective occurs in a review (1) or not (0).
vectorizer = CountVectorizer(vocabulary=common, binary=True)

# Vectorize the reviews and convert the sparse result to a dense array.
train_data_features = vectorizer.fit_transform(Text).toarray()

# One row per review, one column per common adjective.
train_data = pd.DataFrame(train_data_features, columns=common)


The most common adjective are:
 [u'great', u'first', u'many', u'other', u'good', u'much', u'same', u'such', u'own', u'real', u'true', u'different', u'original', u'little', u'few']


#### 6. Convert to a Dataframe for Easy Viewing

In [35]:
# Print a summary of the adjective indicator frame: index range, column
# names, non-null counts, dtypes and memory usage.
train_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 375 entries, 0 to 374
Data columns (total 15 columns):
great        375 non-null int64
first        375 non-null int64
many         375 non-null int64
other        375 non-null int64
good         375 non-null int64
much         375 non-null int64
same         375 non-null int64
such         375 non-null int64
own          375 non-null int64
real         375 non-null int64
true         375 non-null int64
different    375 non-null int64
original     375 non-null int64
little       375 non-null int64
few          375 non-null int64
dtypes: int64(15)
memory usage: 44.0 KB


####  7. Find the rows with the top five descriptive words

In [32]:
# Five descriptive words to look for in the review rows.
words = (
    'best',
    'hope',
    'love',
    'beautiful',
    'great',
)

#### 8. Write the results to a csv

#### 9. Repeat the process for the other top 24 titles

## Part 3: Combine Tables in PostgreSQL

#### 1. Import your two .csv data files into your PostgreSQL database as two different tables

For ease, we can call these table1 and table2

#### 2. Connect to database and query the joined set

#### 3. Join the two tables 

#### 4. Select the newly joined table and save two copies of it into dataframes

## Part 4: Parsing and Exploratory Data Analysis

#### 1. Rename the column headings

#### 2. Run a description of the data

#### 3. Visualize the Data

## Part 5: Build the Decision Tree

#### 1. What is our target attribute? 

#### 2. Prepare the data and define the training set

#### 3. Train the Model

#### 4. Set up test data and test the model

#### 5. Check the results

#### 6. What is overfitting and how are we at risk? 