In [None]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
from sklearn import feature_extraction, linear_model, model_selection, preprocessing
import tensorflow as tf


In [None]:
# Mount Google Drive inside the Colab VM (force_remount=True re-mounts even
# if a previous mount is still present).
from google.colab import drive
drive.mount('/content/drive', force_remount=True)

# Folder inside the mounted Drive that holds the project files.
FOLDERNAME = 'Shared drives/CS 230 Project'
assert FOLDERNAME is not None, "[!] Enter the foldername."

# Add the project folder to the module search path so Python files stored
# there can be imported from this notebook.
import sys
sys.path.append('/content/drive/{}'.format(FOLDERNAME))

# Make the project folder the working directory; the relative dataset paths
# used later are resolved against it.
%cd /content/drive/$FOLDERNAME/

Mounted at /content/drive
/content/drive/Shared drives/CS 230 Project


In [None]:
# Root of the C50 CSV files, relative to the current working directory.
datapath = "datasets/C50/"
train_df = pd.read_csv(f"{datapath}C50train/C50_train_article.csv")
test_df = pd.read_csv(f"{datapath}C50test/C50_test_article.csv")


In [None]:
# Preview the first rows of the freshly loaded training frame.
train_df.head()

Unnamed: 0,Text,Author_num
0,The Commerce Department will issue final rules...,0
1,The Clinton administration will modestly revis...,0
2,The Federal Communications Commission has tent...,0
3,A group of leading trademark specialists plans...,0
4,An influential Internet organisation has backe...,0


In [None]:

# Give both frames the same two column labels.
train_df.columns = test_df.columns = ["Text", "Author"]

In [None]:
# Show the row whose index label is 0 (.loc selects by label, not position).
print(train_df.loc[0])

Text      The Commerce Department will issue final rules...
Author                                                    0
Name: 0, dtype: object


In [None]:
# Shuffle the rows of both frames (frac=1 keeps every row). A fixed seed makes
# the row order, and everything derived from it, reproducible after a
# kernel restart; without it every run produced a different permutation.
SHUFFLE_SEED = 42
train_df = train_df.sample(frac=1, random_state=SHUFFLE_SEED)
test_df = test_df.sample(frac=1, random_state=SHUFFLE_SEED)

Unnamed: 0,Text,Author
1908,French President Jacques Chirac looked set on ...,38
747,Czech Prime Minister Vaclav Klaus said on Wedn...,13
1740,Authorities in central China's Anhui province ...,34
2408,While the U.S. auto industry celebrated its 10...,48
874,Czech President Vaclav Havel has been moved ou...,18
...,...,...
153,"One of China's most prominent dissidents, Wang...",3
1436,Russia's aluminium industry has roared back to...,28
167,China's Communist Party leader Jiang Zemin app...,3
1363,China has missed a golden opportunity to expor...,27


In [None]:
def df_column_switch(df, column1, column2):
    """Return a copy of ``df`` with the positions of two columns exchanged.

    Args:
        df: DataFrame to reorder; it is not modified.
        column1: Name of the first column.
        column2: Name of the second column.

    Returns:
        A new, independent DataFrame holding the same data with ``column1``
        and ``column2`` swapped in the column order.

    Raises:
        ValueError: If either name is not among ``df.columns``.
    """
    order = list(df.columns)
    first, second = order.index(column1), order.index(column2)
    order[first], order[second] = order[second], order[first]
    # Selecting with a list yields a frame that pandas may flag as a slice of
    # ``df``; an explicit copy makes the result safe to add columns to without
    # triggering SettingWithCopyWarning in later cells.
    return df[order].copy()

In [None]:
# Put the "Author" column ahead of "Text" in both frames.
train_df, test_df = (
    df_column_switch(frame, "Author", "Text") for frame in (train_df, test_df)
)

In [None]:
# Show the row labelled 0 again to confirm the new column order
# (.loc is label-based, so shuffling the rows does not change which row this is).
print(train_df.loc[0])

Author                                                    0
Text      The Commerce Department will issue final rules...
Name: 0, dtype: object


In [None]:
def map_authors(data):
    """Assign each distinct author label a consecutive integer id.

    Args:
        data: DataFrame with an 'Author' column.

    Returns:
        dict mapping every distinct 'Author' value to its 0-based position in
        the sorted list of distinct values. Empty when ``data`` has no rows.
    """
    # A set de-duplicates in a single pass; this replaces the row-by-row
    # ``iterrows`` loop with a linear ``in list`` scan per row.
    authors = sorted(set(data['Author'].tolist()))
    return {author: position for position, author in enumerate(authors)}

In [None]:
def reverse_mapping(author_mapping):
    """Invert ``author_mapping`` so that its values become keys.

    If several keys share one value, the key visited last wins.
    """
    return {value: key for key, value in author_mapping.items()}

In [None]:
# Derive the label vocabulary from the training frame: the classes the model
# can learn are the ones present in the training data, and held-out data should
# not define them. An author that appears only in the test frame will then
# fail loudly with a KeyError when the targets are looked up.
author_mapping = map_authors(train_df)
# Inverse lookup: class id -> original author label.
reverse_mappings = reverse_mapping(author_mapping)

{0: 0, 1: 1, 2: 2, 3: 3, 4: 4, 5: 5, 6: 6, 7: 7, 8: 8, 9: 9, 10: 10, 11: 11, 12: 12, 13: 13, 14: 14, 15: 15, 16: 16, 17: 17, 18: 18, 19: 19, 20: 20, 21: 21, 22: 22, 23: 23, 24: 24, 25: 25, 26: 26, 27: 27, 28: 28, 29: 29, 30: 30, 31: 31, 32: 32, 33: 33, 34: 34, 35: 35, 36: 36, 37: 37, 38: 38, 39: 39, 40: 40, 41: 41, 42: 42, 43: 43, 44: 44, 45: 45, 46: 46, 47: 47, 48: 48, 49: 49}


In [None]:
# Bag-of-words features. The vectorizer is fitted on the training text only
# and then applied unchanged to the test text, so both matrices share the
# same columns.
count_vectorizer = feature_extraction.text.CountVectorizer()
train_vectors = count_vectorizer.fit_transform(train_df['Text'])
test_vectors = count_vectorizer.transform(test_df["Text"])


In [None]:
def extract_features(data):
  """Compute two length-based features for every row of ``data``.

  Args:
      data: DataFrame with a 'Text' column of strings.

  Returns:
      Float array of shape (len(data), 2), one row per row of ``data`` in
      the frame's current row order:
        column 0 -- number of characters divided by 5
        column 1 -- number of characters divided by the number of
                    whitespace-separated tokens (0.0 when there are none)
  """
  texts = data['Text'].tolist()
  features = np.zeros((len(texts), 2))
  # Fill by position, not by index label: after shuffling or filtering the
  # labels no longer match array positions, which would misalign the rows
  # with other per-row matrices or raise IndexError.
  for position, text in enumerate(texts):
      features[position, 0] = len(text) / 5
      words = text.split()
      # Guard against empty or whitespace-only text (no tokens to divide by).
      features[position, 1] = len(text) / len(words) if words else 0.0
  return features


In [None]:
def get_mappings(data, author_mapping):
    """Look up the class id of every row's author.

    Side effect: stores the ids in a 'mappings' column on ``data`` itself.

    Args:
        data: DataFrame with an 'Author' column.
        author_mapping: dict from author label to class id.

    Returns:
        The newly written 'mappings' column of ``data``.

    Raises:
        KeyError: If an author in ``data`` is missing from ``author_mapping``.
    """
    data['mappings'] = [author_mapping[name] for name in data['Author']]
    return data['mappings']

In [None]:
# Integer class ids for every row of each frame. Note that get_mappings also
# writes a 'mappings' column onto the frame it is given.
test_targets = get_mappings(test_df, author_mapping)
train_targets = get_mappings(train_df, author_mapping)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  after removing the cwd from sys.path.


In [None]:
# Number of rows in the test frame.
num_rows = test_df.shape[0]

In [None]:
# Use the to_categorical that ships with the already-imported tf.keras; the
# standalone ``keras.utils.np_utils`` path is a legacy location and mixes two
# Keras distributions in one notebook.
to_categorical = tf.keras.utils.to_categorical

label_test = test_targets
label_train = train_targets

# One-hot encode with an explicit width. Without num_classes the width is
# inferred separately from each split's largest label, so the train and test
# matrices could end up with different numbers of columns.
NUM_CLASSES = len(author_mapping)
encoded_label_test = to_categorical(label_test, num_classes=NUM_CLASSES)
encoded_label_train = to_categorical(label_train, num_classes=NUM_CLASSES)



In [None]:
from keras.regularizers import l2

In [None]:
# Empty sequential model; layers are appended in the following cells.
ann = tf.keras.Sequential()

In [None]:
#ann.add(tf.keras.layers.Dense(units=100,  activation='relu'))

In [None]:
#ann.add(tf.keras.layers.Dense(units=100, activation='relu'))

In [None]:
#ann.add(tf.keras.layers.Dense(units=100, activation='relu'))

In [None]:
# Single hidden layer: 100 fully connected units with ReLU activation.
ann.add(tf.keras.layers.Dense(units=100, activation='relu'))

In [None]:
# Output layer: one softmax unit per author class. The width is taken from
# the label mapping instead of a hard-coded 50 so it follows the dataset.
ann.add(tf.keras.layers.Dense(len(author_mapping), activation='softmax'))

In [None]:
# The model ends in a softmax over mutually exclusive classes and is trained
# on one-hot targets, so the matching loss is categorical cross-entropy.
# binary_crossentropy would score every output unit as an independent yes/no
# problem, and Keras would pair it with a per-unit binary accuracy, which
# overstates the 'accuracy' shown during training.
ann.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])

In [None]:
# Train on the densified training count matrix: 5 epochs, mini-batches of 32.
ann.fit(
    x=train_vectors.todense(),
    y=encoded_label_train,
    batch_size=32,
    epochs=5,
)

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


<tensorflow.python.keras.callbacks.History at 0x7f9493561e90>

In [None]:
# Model outputs for every test row; each row holds one score per output unit.
y_pred = ann.predict(test_vectors.todense())
y_pred

array([[4.70265950e-04, 1.52371544e-03, 6.94151549e-03, ...,
        2.26091663e-03, 1.33644545e-03, 9.97471740e-04],
       [7.98826339e-04, 4.65633441e-03, 1.44232623e-03, ...,
        1.63703575e-03, 1.35718263e-03, 2.51419656e-02],
       [3.20480438e-04, 9.87154417e-05, 4.06250510e-05, ...,
        3.39150429e-04, 1.15329502e-04, 3.34686993e-05],
       ...,
       [1.59383053e-03, 4.40061885e-05, 1.31610577e-04, ...,
        5.91818569e-03, 1.39186438e-03, 2.94558617e-04],
       [3.85531621e-06, 6.54743693e-04, 9.14599840e-03, ...,
        4.71449603e-04, 3.23823770e-05, 8.93746328e-05],
       [1.45952180e-02, 4.72974591e-02, 1.04381805e-02, ...,
        1.83158868e-03, 9.99154802e-03, 2.51339190e-02]], dtype=float32)

In [None]:

# Turn each row of scores into a one-hot row that marks its highest score.
y_output = np.zeros_like(y_pred)
y_output[np.arange(y_pred.shape[0]), np.argmax(y_pred, axis=1)] = 1

In [None]:
from sklearn.metrics import confusion_matrix, accuracy_score

In [None]:
# Compare the one-hot predictions with the one-hot test labels.
accuracy_score(encoded_label_test, y_output)

0.6948

In [None]:
def output_probs(reverse_mappings, output):
    """Label the first row of ``output`` with author names.

    Args:
        reverse_mappings: dict from column position to author label.
        output: 2-D sequence of scores; only ``output[0]`` is read.

    Returns:
        dict mapping each author label to its score rounded to 3 decimals.
    """
    return {
        reverse_mappings[position]: round(score, 3)
        for position, score in enumerate(output[0])
    }


In [None]:
# Free-standing article text used as a single ad-hoc input for the trained
# model in the cells below.
# NOTE(review): the name `str` shadows Python's built-in `str` type for the
# rest of the kernel session. Renaming it also requires updating the later
# cell that passes it to the vectorizer.
str = """
The U.S. Department of Agriculture said this represented the first confirmed case of highly pathogenic H7 avian influenza (HPAI) in commercial poultry in the United States this year. It is the first time HPAI has been found in Tennessee, the state government said.

Tyson, the biggest chicken meat producer in the United States, said in a statement it was working with Tennessee and federal officials to contain the virus by euthanizing the birds on the contract farm.
In 2014 and 2015, during a widespread outbreak of HPAI, the United States killed nearly 50 million birds, mostly egg-laying hens. The losses pushed U.S. egg prices to record highs and prompted trading partners to ban imports of American poultry, even though there was little infection then in the broiler industry.

No people were affected in that outbreak, which was primarily of the H5N2 strain. The risk of human infection in poultry outbreaks is low, although in China people have died this winter amid an outbreak of the H7N9 virus in birds.

The facility in Tennessee’s Lincoln County has been placed under quarantine, along with approximately 30 other poultry farms within a 6.2-mile (10 km) radius of the site, the state said. Other flocks in the quarantined area are being tested, it added.

Tyson, the USDA and the state did not name the facility involved. Tyson said that it did not expect disruptions to its chicken business.
The USDA should have more information by Monday evening about the particular strain of the virus involved, spokeswoman Donna Karlsons said by email.

HPAI bird flu was last found in a commercial turkey flock in Indiana in January 2016.

The USDA said it would inform the World Organization for Animal Health (OIE) and international trading partners of the outbreak.

The biggest traditional markets for U.S. chicken meat are Mexico and Canada, which introduced state or regional bans on U.S. broiler exports after the outbreak two years ago, and China, which imposed a national ban.

Tennessee’s broiler production is too small to rank it in the top five U.S. producing states but it is the third-largest generator of cash receipts in agriculture for the state.

In January, the USDA detected bird flu in a wild duck in Montana that appeared to match one of the strains found during the 2014 and 2015 outbreak.

The United States stepped up biosecurity measures aimed at preventing the spread of bird flu after the outbreak two years ago.

Tyson said precautions being taken include disinfecting all vehicles entering farms and banning all nonessential visitor access to contract farms.


In recent months, different strains of bird flu have been confirmed across Asia and in Europe. Authorities have culled millions of birds in affected areas to control the outbreaks.

France, which has the largest poultry flock in the European Union, has reported outbreaks of the highly contagious H5N8 bird flu virus. In South Korea, the rapid spread of the H5N6 strain of the virus has led to the country’s worst-ever outbreak of bird flu.
"""

In [None]:
# Vectorize the single sample document with the already-fitted vocabulary.
str_vectorized = count_vectorizer.transform([str])

In [None]:
# Densify before predicting, exactly as is done for the training and test
# matrices, so the sample goes through the same input path as the data the
# model was fitted and evaluated on.
str_pred = ann.predict(str_vectorized.todense())
str_pred

array([[1.6409713e-06, 1.2668604e-05, 1.1965750e-07, 2.6214395e-06,
        7.6080332e-06, 5.7978836e-05, 2.8710474e-07, 1.4883191e-06,
        5.4806424e-07, 5.6856658e-07, 1.2252617e-07, 9.6658441e-05,
        1.2663855e-07, 4.4233943e-06, 1.2797197e-05, 1.1982565e-06,
        6.2871715e-07, 5.4416045e-08, 5.7018533e-05, 6.7452536e-05,
        7.3292293e-04, 3.2734354e-06, 4.4523263e-06, 3.1457410e-05,
        1.1384408e-05, 1.6343555e-06, 3.0836657e-06, 9.9857581e-01,
        3.5891619e-05, 8.4840731e-06, 1.3027211e-06, 2.4528043e-05,
        2.7533076e-07, 3.3692144e-05, 4.4436215e-06, 3.4564200e-05,
        2.6142730e-05, 3.9268917e-07, 6.5175950e-06, 5.5513948e-07,
        3.5860362e-06, 3.8522558e-06, 5.1296399e-07, 5.2556770e-06,
        6.7708438e-06, 2.8483746e-06, 5.0145212e-07, 1.0986077e-05,
        2.0582861e-06, 9.6852491e-05]], dtype=float32)

In [None]:
# One-hot row marking the highest-scoring class for the sample document.
str_output = np.zeros_like(str_pred)
str_output[np.arange(str_pred.shape[0]), np.argmax(str_pred, axis=1)] = 1

In [None]:
# Display the one-hot prediction for the sample document.
str_output

array([[0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 1., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0.]], dtype=float32)

In [None]:
# Sanity check: one row, one column per output unit.
print(str_output.shape)

(1, 50)


In [None]:
# Per-author scores for the sample document, rounded to 3 decimals and keyed
# by author label.
probs = output_probs(reverse_mappings, str_pred)
probs

{0: 0.0,
 1: 0.0,
 2: 0.0,
 3: 0.0,
 4: 0.0,
 5: 0.0,
 6: 0.0,
 7: 0.0,
 8: 0.0,
 9: 0.0,
 10: 0.0,
 11: 0.0,
 12: 0.0,
 13: 0.0,
 14: 0.0,
 15: 0.0,
 16: 0.0,
 17: 0.0,
 18: 0.0,
 19: 0.0,
 20: 0.001,
 21: 0.0,
 22: 0.0,
 23: 0.0,
 24: 0.0,
 25: 0.0,
 26: 0.0,
 27: 0.999,
 28: 0.0,
 29: 0.0,
 30: 0.0,
 31: 0.0,
 32: 0.0,
 33: 0.0,
 34: 0.0,
 35: 0.0,
 36: 0.0,
 37: 0.0,
 38: 0.0,
 39: 0.0,
 40: 0.0,
 41: 0.0,
 42: 0.0,
 43: 0.0,
 44: 0.0,
 45: 0.0,
 46: 0.0,
 47: 0.0,
 48: 0.0,
 49: 0.0}

In [None]:
# Map the one-hot prediction back to author labels and report the label whose
# entry is the largest (the first such label if several tie).
prediction = output_probs(reverse_mappings, str_output)
max(prediction.items(), key=lambda pair: pair[1])[0]

27