In [1]:

# imports
import os
import sys
import types
import json

# figure size/format
# Width/height are passed to matplotlib's figure.figsize; the dpi value is
# used for both on-screen figures and saved figures.
fig_width = 7
fig_height = 5
fig_format = 'retina'
fig_dpi = 96

# matplotlib defaults / format
# Best-effort: any failure (matplotlib or IPython missing, no active shell)
# is deliberately ignored so that the setup cell never aborts the notebook.
try:
  import matplotlib.pyplot as plt
  plt.rcParams['figure.figsize'] = (fig_width, fig_height)
  plt.rcParams['figure.dpi'] = fig_dpi
  plt.rcParams['savefig.dpi'] = fig_dpi
  # IPython.display.set_matplotlib_formats is deprecated (IPython 7.23+) and
  # emits a DeprecationWarning; prefer the matplotlib_inline implementation
  # and only fall back to the old location on installs that lack it.
  try:
    from matplotlib_inline.backend_inline import set_matplotlib_formats
  except ImportError:
    from IPython.display import set_matplotlib_formats
  set_matplotlib_formats(fig_format)
except Exception:
  pass

# plotly: render through the "notebook_connected" renderer.
# Optional dependency -- if plotly cannot be imported or configured the
# failure is ignored on purpose.
try:
  from plotly import io as pio
  pio.renderers.default = "notebook_connected"
except Exception:
  pass

# pandas: switch on the LaTeX repr, but only when figures target PDF output.
# pandas is imported unconditionally (as before); any failure is ignored.
try:
  import pandas as pd
  targeting_pdf = (fig_format == 'pdf')
  if targeting_pdf:
    pd.set_option('display.latex.repr', True)
except Exception:
  pass



# output kernel dependencies
# Maps the source path of every currently imported module to that file's
# modification time, then prints the mapping as a single JSON object.
kernel_deps = {}
for loaded in tuple(sys.modules.values()):
  # sys.modules can hold objects that are not real modules (some packages
  # replace their own entry), and getattr on those may misbehave -- only
  # ordinary module objects are inspected.
  if isinstance(loaded, types.ModuleType):
    source = getattr(loaded, "__file__", None) or ""
    # compiled files (.pyc / .pyo): drop the last character to get the .py path
    if source.endswith((".pyc", ".pyo")):
      source = source[:-1]
    # skip modules without a file, or whose file is not on disk
    if source and os.path.exists(source):
      kernel_deps[source] = os.stat(source).st_mtime
print(json.dumps(kernel_deps))

# set run_path if requested
if r'/Users/davoodwadi/MLCourse/davoodwadi.github.io/code':
  os.chdir(r'/Users/davoodwadi/MLCourse/davoodwadi.github.io/code')

# reset state
%reset

def ojs_define(**kwargs):
  """Publish Python values to Observable JS as an ``ojs-define`` script tag.

  Each keyword argument becomes one ``{"name": ..., "value": ...}`` entry in
  a JSON payload that is displayed as
  ``<script type="ojs-define">...</script>`` with display metadata
  ``{"ojs_define": True}``.

  pandas Series/DataFrame values (including subclasses) are converted to a
  ``{column: [values...]}`` dict first; every other value is handed to
  ``json.dumps`` unchanged.

  Raises:
    TypeError: from ``json.dumps`` if a value is not JSON serializable.
  """
  import json
  try:
    # IPython 7.14 preferred import
    from IPython.display import display, HTML
  except ImportError:
    # older IPython: fall back to the legacy location (only import failures
    # trigger the fallback; other exceptions propagate)
    from IPython.core.display import display, HTML

  # do some minor magic for convenience when handling pandas
  # dataframes
  def convert(value):
    try:
      import pandas as pd
    except ModuleNotFoundError: # don't do the magic when pandas is not available
      return value
    # isinstance (not type equality) so that Series/DataFrame subclasses are
    # converted too instead of failing later inside json.dumps
    if isinstance(value, pd.Series):
      value = pd.DataFrame(value)
    if isinstance(value, pd.DataFrame):
      # transpose + orient='split' gives "index" = column names and
      # "data" = one list of values per column
      split = json.loads(value.T.to_json(orient='split'))
      return dict(zip(split["index"], split["data"]))
    return value

  payload = dict(contents=[dict(name=key, value=convert(value)) for (key, value) in kwargs.items()])
  display(HTML('<script type="ojs-define">' + json.dumps(payload) + '</script>'), metadata=dict(ojs_define = True))
globals()["ojs_define"] = ojs_define


  set_matplotlib_formats(fig_format)




In [2]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
import numpy as np

In [3]:
# Toy sentiment corpus. Each text is written next to its label so the two
# lists cannot drift out of alignment. 'I enjoy going for walks' expresses
# positive sentiment and is labelled accordingly.
labeled_documents = [
    ('The sun is shining', 'positive'),
    ('The weather is beautiful', 'positive'),
    ('I enjoy going for walks', 'positive'),
    ('I hate rainy days', 'negative'),
]

documents = [text for text, sentiment in labeled_documents]
labels = [sentiment for text, sentiment in labeled_documents]

In [4]:
# Hold out 20% of the documents for testing; the fixed random_state makes
# the shuffle (and therefore the split) repeatable across runs.
split = train_test_split(
    documents,
    labels,
    test_size=0.2,
    random_state=42,
)
doc_train, doc_test, y_train, y_test = split
print(doc_train)

['I hate rainy days', 'The sun is shining', 'I enjoy going for walks']


In [5]:
# Bag-of-words vectorizer with default settings. On this corpus the fitted
# vocabulary (printed in the next cell) contains lowercased tokens and has no
# entry for the one-letter word 'I'.
vectorizer = CountVectorizer()

In [6]:
# Learn the vocabulary from the training documents and build the
# document-term count matrix (one row per document, one column per word).
x_train = vectorizer.fit_transform(doc_train)
print(x_train.todense())
print(vectorizer.vocabulary_)

# vocabulary_ maps word -> column index; invert it to look words up by column.
i2w = {col: word for word, col in vectorizer.vocabulary_.items()}

# Readable view of the matrix: a cell whose count is exactly 1 shows the word
# for that column, every other cell shows its count as text. np.where picks a
# result dtype wide enough for both inputs, so long tokens are not truncated
# (assigning into the fixed-width result of astype(str) would truncate them).
train_counts = x_train.toarray()
column_words = np.array([str(i2w[col]) for col in range(train_counts.shape[1])], dtype=str)
doc = np.where(train_counts == 1, column_words, train_counts.astype(str))
print(doc)

[[1 0 0 0 1 0 1 0 0 0 0]
 [0 0 0 0 0 1 0 1 1 1 0]
 [0 1 1 1 0 0 0 0 0 0 1]]
{'hate': 4, 'rainy': 6, 'days': 0, 'the': 9, 'sun': 8, 'is': 5, 'shining': 7, 'enjoy': 1, 'going': 3, 'for': 2, 'walks': 10}
[['days' '0' '0' '0' 'hate' '0' 'rainy' '0' '0' '0' '0']
 ['0' '0' '0' '0' '0' 'is' '0' 'shining' 'sun' 'the' '0']
 ['0' 'enjoy' 'for' 'going' '0' '0' '0' '0' '0' '0' 'walks']]


In [7]:
# Multinomial Naive Bayes classifier, fitted on the training word-count
# matrix (x_train) and the matching sentiment labels (y_train).
model = MultinomialNB()
model.fit(x_train, y_train)

In [8]:
# Encode the held-out documents with the vocabulary learned on the training
# set (transform only -- no refitting) and classify them.
x_test = vectorizer.transform(doc_test)
y_pred = model.predict(x_test)
print(doc_test)
print(x_test.todense())

# Readable view of the test matrix, using the column -> word lookup (i2w)
# built when the vectorizer was fitted: a count of exactly 1 shows the word,
# any other cell shows its count as text. np.where picks a result dtype wide
# enough for both inputs, so long tokens are not truncated.
test_counts = x_test.toarray()
column_words = np.array([str(i2w[col]) for col in range(test_counts.shape[1])], dtype=str)
doc = np.where(test_counts == 1, column_words, test_counts.astype(str))
print(doc)
print(y_pred)

['The weather is beautiful']
[[0 0 0 0 0 1 0 0 0 1 0]]
[['0' '0' '0' '0' '0' 'is' '0' '0' '0' 'the' '0']]
['positive']


In [9]:
# Fraction of held-out documents whose predicted label matches the true one.
accuracy = accuracy_score(y_true=y_test, y_pred=y_pred)
print('Accuracy:', accuracy)

Accuracy: 1.0
