In [100]:
import pandas as pd
import numpy as np

from nltk import word_tokenize, sent_tokenize

from sklearn.feature_extraction.text import TfidfVectorizer

from sklearn.model_selection import train_test_split

from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, accuracy_score

In [4]:
with open('apple-computers.txt') as f:
    apple_computers = f.read()

In [5]:
with open('apple-fruit.txt') as f:
    apple_fruit = f.read()

In [14]:
apple_computers.replace('\n', ' ')[:100]

'Apple Inc. From Wikipedia, the free encyclopedia This article is about the technology company. For o'

In [30]:
doc = []
for line in open('apple-computers.txt', 'r'):
    doc.append(line)

In [46]:

apple_computer_list = [element.replace('\t', '') for element in apple_computers.split('\n') if element]

apple_fruit_list = [element.replace('\t', '') for element in apple_fruit.split('\n') if element]

In [47]:
len(apple_computer_list)

283

In [48]:
len(apple_fruit_list)

187

In [77]:
# Let's build a datafram where we assign apple_computer as 1 and apple_fruit as 0

df = pd.DataFrame(apple_computer_list + apple_fruit_list)

df = df.rename(columns = {df.columns[0]: 'text'})


In [78]:

df.loc[df.index[:len(apple_computer_list)], 'label'] = 1


df.loc[df.index[len(apple_computer_list):], 'label'] = 0


In [80]:
df.head()

Unnamed: 0,text,label
0,Apple Inc.,1.0
1,"From Wikipedia, the free encyclopedia",1.0
2,This article is about the technology company. ...,1.0
3,Page semi-protected,1.0
4,Coordinates: 37.33182°N 122.03118°W,1.0


In [89]:
df_train, df_validation = train_test_split(df, test_size=0.2, random_state=1)


In [86]:
df_train.shape

(376, 2)

In [87]:
df_test.shape

(94, 2)

In [90]:
X_train = df_train['text']
y_train = df_train['label']

X_validation = df_validation['text']
y_validation = df_validation['label']

In [91]:

vectorizer = TfidfVectorizer(stop_words= 'english')
vectorizer.fit(X_train)


TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.float64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), norm='l2', preprocessor=None, smooth_idf=True,
        stop_words='english', strip_accents=None, sublinear_tf=False,
        token_pattern='(?u)\\b\\w\\w+\\b', tokenizer=None, use_idf=True,
        vocabulary=None)

In [92]:
X_train_tfidf = vectorizer.transform(X_train)
X_validation_tfidf = vectorizer.transform(X_validation)

In [97]:
y_train = np.array(y_train).reshape(-1,1).ravel()
y_validation = np.array(y_validation).reshape(-1,1).ravel()


In [101]:
def training(classifier, X_train, X_validation, y_train, y_validation):
    model = classifier
    model.fit(X_train, y_train)
    y_validation_predict = model.predict(X_validation)
    report = classification_report(y_validation, y_validation_predict)
    print(report)
    return model
    

In [103]:
training(LogisticRegression(), X_train_tfidf, X_validation_tfidf, y_train, y_validation)

              precision    recall  f1-score   support

         0.0       1.00      0.68      0.81        40
         1.0       0.81      1.00      0.89        54

   micro avg       0.86      0.86      0.86        94
   macro avg       0.90      0.84      0.85        94
weighted avg       0.89      0.86      0.86        94





LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='warn',
          n_jobs=None, penalty='l2', random_state=None, solver='warn',
          tol=0.0001, verbose=0, warm_start=False)

In [104]:
training(RandomForestClassifier(), X_train_tfidf, X_validation_tfidf, y_train, y_validation)

              precision    recall  f1-score   support

         0.0       0.92      0.55      0.69        40
         1.0       0.74      0.96      0.84        54

   micro avg       0.79      0.79      0.79        94
   macro avg       0.83      0.76      0.76        94
weighted avg       0.82      0.79      0.77        94





RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=None,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False)

In [112]:
best_model = training(LogisticRegression(), X_train_tfidf, X_validation_tfidf, y_train, y_validation)

              precision    recall  f1-score   support

         0.0       1.00      0.68      0.81        40
         1.0       0.81      1.00      0.89        54

   micro avg       0.86      0.86      0.86        94
   macro avg       0.90      0.84      0.85        94
weighted avg       0.89      0.86      0.86        94



### Try it on the test data

In [107]:
doc = []
for line in open('input00.txt', 'r'):
    doc.append(line.replace('\n', '').replace('\t', ''))

In [109]:
doc = doc[1:]

In [124]:
doc

["Apple already plans to buy back $100 billion in shares, including $16 billion worth last quarter. Icahn probably pounded the dinner table he and Cook shared recently for their much-reported bread-breaking at Icahn's New York apartment. Apple's cash stash currently sits at a whopping $145 billion but only $43 billion is in the U.S., which is why Icahn wants to float bonds to cover a buy back.",
 'Fortunately, there are ���low-chill��� apple varieties for temperate climates. (Chilling hours are defined as nonconsecutive hours of winter temperatures below 45 degrees.) As a general guide, if you live on or near the coast, your garden gets only 100 to 200 chilling hours. Inland San Diego gardens get about 400 to 500 chilling hours ��� still considered ���low chill.���',
 'If this seems a bit like d��j�� vu, you���ll recall that Apple just held an event to unveil two new iPhone models ��� the 5c and 5s ��� back on September 10.',
 '���Both Samsung and Apple are important contributors to th

In [123]:
for query in doc:
    query_tfidf = vectorizer.transform([query])
    label = best_model.predict(query_tfidf)
    if label == 1:
        print('computer-company')
    else:
        print('fruit')

computer-company
computer-company
computer-company
computer-company
computer-company
computer-company
computer-company
fruit
fruit
computer-company


In [125]:
# Combine

import pandas as pd
import numpy as np

from nltk import word_tokenize, sent_tokenize

from sklearn.feature_extraction.text import TfidfVectorizer

from sklearn.model_selection import train_test_split

from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, accuracy_score

with open('apple-computers.txt') as f:
    apple_computers = f.read()
    
with open('apple-fruit.txt') as f:
    apple_fruit = f.read()


apple_computer_list = [element.replace('\t', '') for element in apple_computers.split('\n') if element]

apple_fruit_list = [element.replace('\t', '') for element in apple_fruit.split('\n') if element]

# Let's build a datafram where we assign apple_computer as 1 and apple_fruit as 0

df = pd.DataFrame(apple_computer_list + apple_fruit_list)

df = df.rename(columns = {df.columns[0]: 'text'})


df.loc[df.index[:len(apple_computer_list)], 'label'] = 1


df.loc[df.index[len(apple_computer_list):], 'label'] = 0

df_train, df_validation = train_test_split(df, test_size=0.2, random_state=1)


X_train = df_train['text']
y_train = df_train['label']

X_validation = df_validation['text']
y_validation = df_validation['label']


vectorizer = TfidfVectorizer(stop_words= 'english')
vectorizer.fit(X_train)

X_train_tfidf = vectorizer.transform(X_train)
X_validation_tfidf = vectorizer.transform(X_validation)

y_train = np.array(y_train).reshape(-1,1).ravel()
y_validation = np.array(y_validation).reshape(-1,1).ravel()

def training(classifier, X_train, X_validation, y_train, y_validation):
    model = classifier
    model.fit(X_train, y_train)
    y_validation_predict = model.predict(X_validation)
    report = classification_report(y_validation, y_validation_predict)
#     print(report)
    return model

best_model = training(LogisticRegression(), X_train_tfidf, X_validation_tfidf, y_train, y_validation)

doc = []
for line in open('input00.txt', 'r'):
    doc.append(line.replace('\n', '').replace('\t', ''))
    
doc = doc[1:]

for query in doc:
    query_tfidf = vectorizer.transform([query])
    label = best_model.predict(query_tfidf)
    if label == 1:
        print('computer-company')
    else:
        print('fruit')


computer-company
computer-company
computer-company
computer-company
computer-company
computer-company
computer-company
fruit
fruit
computer-company


