<a href="https://colab.research.google.com/github/davidclizbe/datascience/blob/master/Clizbe_Day_74%2C_Lecture_1_Assignment.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Machine Learning: Text Classification Assignment

In [None]:
from nltk import word_tokenize
from nltk.corpus import stopwords
from sklearn.pipeline import Pipeline
from nltk.stem.snowball import SnowballStemmer
from nltk.stem.wordnet import WordNetLemmatizer
from sklearn.metrics import classification_report
from sklearn.model_selection import cross_val_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.model_selection import train_test_split as tts
from nltk.corpus.reader.plaintext import CategorizedPlaintextCorpusReader

In [None]:
# Mount Google Drive so files under /content/drive are readable from this notebook.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
# Root folder of the corpus on the mounted drive.
PATH = '/content/drive/My Drive/APNEWS/AP_News'
# Every .txt file under PATH is treated as a document.
DOC_PATTERN = r'.*\.txt'
# The folder part of a file id (the captured group) is used as its category.
CAT_PATTERN = r'([\w_\s]+)/.*'
corpus = CategorizedPlaintextCorpusReader(
    PATH,
    DOC_PATTERN,
    cat_pattern=CAT_PATTERN,
)

### Use the CategorizedPlaintextCorpusReader to import the AP_News corpus.

In [None]:
# Preview only the start of the corpus text; displaying the full concatenation
# of every article floods the cell output and bloats the saved notebook.
corpus.raw()[:1000]



In [None]:
# List the document ids found by the reader; each id is prefixed with its category folder.
corpus.fileids()

['health/http-apnews-com-03bc406312384416843138b2b23dec14.txt',
 'health/http-apnews-com-063eecb9a73e43b5a47f24d5d072de89.txt',
 'health/http-apnews-com-07145c801cc64b9c9fde6150af0e79db.txt',
 'health/http-apnews-com-08e5c195bf04471e9c4a127abe831d91.txt',
 'health/http-apnews-com-0dc745e5b66a47328b0be32f3cc9b1a2.txt',
 'health/http-apnews-com-145566f8c16a4d26ac36a317bbd3a02d.txt',
 'health/http-apnews-com-17d84e14096b4647828bf07cea2f6656.txt',
 'health/http-apnews-com-1a65f77ea55c4576ab9a981ae7c65ec5.txt',
 'health/http-apnews-com-1e96d18f8caa454e99d3f489865f8ff7.txt',
 'health/http-apnews-com-262de117d42947649c1a1caa6f4f70e7.txt',
 'health/http-apnews-com-3556845f3ab74186a26ec6d10739f9ca.txt',
 'health/http-apnews-com-386cc4805cfc49098181d525c113b65b.txt',
 'health/http-apnews-com-3bde9035af4044d082859ab587cfba0b.txt',
 'health/http-apnews-com-48e4b58e0a2e401cb52c5b600255657b.txt',
 'health/http-apnews-com-49877aba863e4f5199d0a22d68966bcc.txt',
 'health/http-apnews-com-5276e79e3d8f4ee

In [None]:
# Number of documents in the corpus.
len(corpus.fileids())

217

### Create two separate lists - one containing the text from each document and another containing the category of each article in the corpus.

In [None]:
# Build two parallel lists in a single pass over the file ids:
# docs[i] is the raw text of the i-th file and categories[i] is its first
# (folder-derived) category label.
docs = []
categories = []
for fileid in corpus.fileids():
    docs.append(corpus.raw(fileid))
    categories.append(corpus.categories(fileid)[0])

In [None]:
# Show the file ids again; this is the same call that docs and categories were built from.
corpus.fileids()

['health/http-apnews-com-03bc406312384416843138b2b23dec14.txt',
 'health/http-apnews-com-063eecb9a73e43b5a47f24d5d072de89.txt',
 'health/http-apnews-com-07145c801cc64b9c9fde6150af0e79db.txt',
 'health/http-apnews-com-08e5c195bf04471e9c4a127abe831d91.txt',
 'health/http-apnews-com-0dc745e5b66a47328b0be32f3cc9b1a2.txt',
 'health/http-apnews-com-145566f8c16a4d26ac36a317bbd3a02d.txt',
 'health/http-apnews-com-17d84e14096b4647828bf07cea2f6656.txt',
 'health/http-apnews-com-1a65f77ea55c4576ab9a981ae7c65ec5.txt',
 'health/http-apnews-com-1e96d18f8caa454e99d3f489865f8ff7.txt',
 'health/http-apnews-com-262de117d42947649c1a1caa6f4f70e7.txt',
 'health/http-apnews-com-3556845f3ab74186a26ec6d10739f9ca.txt',
 'health/http-apnews-com-386cc4805cfc49098181d525c113b65b.txt',
 'health/http-apnews-com-3bde9035af4044d082859ab587cfba0b.txt',
 'health/http-apnews-com-48e4b58e0a2e401cb52c5b600255657b.txt',
 'health/http-apnews-com-49877aba863e4f5199d0a22d68966bcc.txt',
 'health/http-apnews-com-5276e79e3d8f4ee

### Preprocess the corpus, making sure to include the following steps:

- Word tokenize the documents.
- Lemmatize, stem, and lowercase all tokens.
- Remove punctuation and stop words.

In [None]:
import nltk

# Fetch the NLTK resources used below: the tokenizer model, the stop-word
# lists, and the WordNet data for the lemmatizer.
for resource in ('punkt', 'stopwords', 'wordnet'):
  nltk.download(resource)

def preprocess(docs):
  """Normalize raw documents into strings of cleaned, stemmed tokens.

  Each document is word-tokenized. Tokens that are not purely alphabetic,
  or whose lowercase form is an English stop word, are dropped. The
  remaining tokens are lowercased, lemmatized, and then stemmed, and
  finally re-joined with single spaces.

  Args:
    docs: Iterable of raw document strings.

  Returns:
    A list containing one cleaned string per input document, in input order.
  """
  lemmatizer = WordNetLemmatizer()
  stemmer = SnowballStemmer('english')
  # Build the stop-word set once. The lookup used to call
  # stopwords.words('english') for every single token and scan the list.
  stop_words = set(stopwords.words('english'))
  preprocessed = []

  for doc in docs:
    cleaned = []
    for token in word_tokenize(doc):
      lowered = token.lower()
      if token.isalpha() and lowered not in stop_words:
        cleaned.append(stemmer.stem(lemmatizer.lemmatize(lowered)))

    preprocessed.append(" ".join(cleaned))
  return preprocessed

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [None]:
# Clean every raw document; the result holds one normalized string per article.
preprocessed = preprocess(docs)

### Split the data into training and testing sets with the size of the test set being 30% of the records.

In [None]:
# Hold out 30% of the articles for testing.
# random_state pins the shuffle so the split (and every score computed from it)
# is the same on each run; stratify keeps the category proportions equal in the
# training and test sets, which matters with only a couple of hundred documents.
x_train, x_test, y_train, y_test = tts(
    preprocessed,
    categories,
    test_size=0.3,
    random_state=42,
    stratify=categories,
)
x_train

['yakima ap health offici say hepat outbreak yakima counti involv case yakima health district said thursday case involv peopl experienc homeless use illicit drug health district announc outbreak five case confirm yakima counti sinc various local agenc done outreach vaccin peopl virus affect liver spread contamin fece caus symptom like fever dark urin skin eye fatigu gastric issu peopl pas along eat drink taint food water sex juli washington state depart health announc hepat outbreak genet test found yakima counti hepat strain observ relat hepat outbreak spokan',
 'concord ap new hampshir depart revenu administr offer new system onlin taxpay overhaul technolog depart collect billion tax year expect transit complet end new revenu inform manag system granit tax connect onlin user portal launch taxpay includ pay meal rental nurs facil qualiti assess medicaid enhanc tax taxpay file tax electron schedul autom onlin payment along task onlin http',
 'fayettevill ark ap univers arkansa student 

In [None]:
# Preprocessed documents held out for evaluation.
x_test

['napl ap richest prize woman golf histori ride outcom sei young kim deliv ultim money putt even even know score kim tie lead go final hole sunday cme group tour championship nerv obvious miss four straight putt foot closer moment later south korean took place lpga tour histori kim made putt life birdi broke sharpli right cup victori charley hull million payoff mean lot know biggest purs woman golf histori said kim whose win third year incred honor hull made earn six shot behind go back nine tiburon golf club hull birdi five last seven hole includ last three birdi gave share lead right daniell kang made eagl putt came inch short close birdi tie kim fulli awar pressur felt realli nervous walk hole said like big deal tri play like practic round think would make comfort even realli nervous oblivi competit day kim thought nelli korda start one shot behind fell back pair cost drive left top crown back green kim figur would enough crowd cheer pump fist fought back tear later said realiz mean

In [None]:
# Category labels for the training documents.
y_train

['health',
 'tech',
 'health',
 'sports',
 'tech',
 'tech',
 'tech',
 'politics',
 'tech',
 'politics',
 'tech',
 'politics',
 'health',
 'tech',
 'health',
 'politics',
 'sports',
 'health',
 'sports',
 'politics',
 'health',
 'sports',
 'politics',
 'sports',
 'health',
 'politics',
 'sports',
 'politics',
 'tech',
 'tech',
 'health',
 'sports',
 'politics',
 'politics',
 'tech',
 'sports',
 'politics',
 'politics',
 'tech',
 'politics',
 'sports',
 'tech',
 'health',
 'health',
 'tech',
 'health',
 'tech',
 'health',
 'tech',
 'sports',
 'tech',
 'health',
 'politics',
 'tech',
 'health',
 'health',
 'sports',
 'health',
 'sports',
 'sports',
 'tech',
 'tech',
 'tech',
 'tech',
 'sports',
 'health',
 'health',
 'politics',
 'health',
 'health',
 'politics',
 'tech',
 'politics',
 'politics',
 'sports',
 'tech',
 'tech',
 'tech',
 'politics',
 'tech',
 'politics',
 'health',
 'sports',
 'tech',
 'politics',
 'sports',
 'politics',
 'politics',
 'sports',
 'health',
 'tech',
 'politic

In [None]:
# Category labels for the held-out documents.
y_test

['sports',
 'politics',
 'tech',
 'health',
 'health',
 'sports',
 'sports',
 'politics',
 'politics',
 'health',
 'sports',
 'tech',
 'politics',
 'sports',
 'politics',
 'sports',
 'health',
 'politics',
 'health',
 'sports',
 'sports',
 'politics',
 'sports',
 'sports',
 'sports',
 'sports',
 'politics',
 'health',
 'tech',
 'health',
 'health',
 'politics',
 'tech',
 'sports',
 'sports',
 'tech',
 'sports',
 'health',
 'politics',
 'health',
 'politics',
 'politics',
 'health',
 'health',
 'politics',
 'health',
 'health',
 'politics',
 'tech',
 'tech',
 'tech',
 'tech',
 'sports',
 'health',
 'health',
 'sports',
 'health',
 'tech',
 'sports',
 'politics',
 'sports',
 'health',
 'sports',
 'sports',
 'politics',
 'sports']

### Construct a pipeline that TF-IDF vectorizes the text and trains a Random Forest classification model.

In [None]:
# Token counts -> TF-IDF weights -> random forest classifier.
model = Pipeline([
    ('vect', CountVectorizer()),
    ('tfidf', TfidfTransformer()),
    # Fixed seed: an unseeded forest gives different trees, and therefore
    # different scores, every time the cell is run.
    ('rfc', RandomForestClassifier(random_state=42))
])

model.fit(x_train, y_train)

Pipeline(memory=None,
         steps=[('vect',
                 CountVectorizer(analyzer='word', binary=False,
                                 decode_error='strict',
                                 dtype=<class 'numpy.int64'>, encoding='utf-8',
                                 input='content', lowercase=True, max_df=1.0,
                                 max_features=None, min_df=1,
                                 ngram_range=(1, 1), preprocessor=None,
                                 stop_words=None, strip_accents=None,
                                 token_pattern='(?u)\\b\\w\\w+\\b',
                                 tokenizer=None, vocabulary=Non...
                 RandomForestClassifier(bootstrap=True, ccp_alpha=0.0,
                                        class_weight=None, criterion='gini',
                                        max_depth=None, max_features='auto',
                                        max_leaf_nodes=None, max_samples=None,
                                

### Generate predictions on the test set and print a classification report to evaluate how well the model performed.

In [None]:
# classification_report is already imported in the notebook's first import cell,
# so it is not re-imported here.
predictions = model.predict(x_test)
# The report is a pre-formatted string, so print() is needed to render its layout.
print(classification_report(y_test, predictions))

              precision    recall  f1-score   support

      health       1.00      0.67      0.80        18
    politics       0.68      0.94      0.79        16
      sports       0.94      0.77      0.85        22
        tech       0.43      0.60      0.50        10

    accuracy                           0.76        66
   macro avg       0.76      0.74      0.73        66
weighted avg       0.82      0.76      0.77        66



### Perform 10-fold cross validation and obtain the average F1 score across all the folds.

### Ingest, preprocess, and predict the topic of the article at the following URL.

In [None]:
# Article to ingest, preprocess, and classify.
url = 'https://www.nytimes.com/2019/11/25/business/uber-london.html'