In [38]:
# Mount Google Drive at /content/drive (requires the Colab runtime) so the
# project files stored there can be read and written.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [39]:
# Silence every Python warning for the rest of the session.
# NOTE(review): this blanket filter also hides deprecation warnings raised by
# the libraries used below — consider narrowing it to specific categories.
import warnings
warnings.filterwarnings("ignore")

In [40]:
# Work from the project folder on the mounted Drive; the relative CSV paths
# used by the later cells are resolved against this directory.
import os
os.chdir("/content/drive/MyDrive/Document Tagging")

In [41]:
import numpy as np
import string
import re
import nltk
from nltk.tokenize import word_tokenize
import pandas as pd


# Stop words removed from every title and tag string by text_preprocessing.
# Each word is listed exactly once.
# NOTE(review): negations ('no', 'not', 'never') are stripped too, so a title
# such as "correlation does not mean causation" becomes "correlation does mean
# causation" — confirm this is intended.
custom_words = [
    # auxiliary verbs
    'is', 'are', 'have', 'can', 'will', 'do',
    # pronouns
    'i', 'you', 'he', 'she', 'it', 'we', 'they', 'me', 'him', 'her', 'us', 'them',
    # articles and prepositions
    'a', 'an', 'the', 'in', 'on', 'at', 'with', 'by', 'for', 'of', 'to', 'from',
    # conjunctions
    'and', 'or', 'but', 'so', 'yet', 'nor',
    # missing-value tokens and negations
    'null', 'nan', 'no', 'not',
    # adverbs
    'very', 'always', 'never', 'too', 'often', 'sometimes', 'rarely', 'frequently',
    'quickly', 'slowly', 'easily', 'hardly', 'nearly', 'completely', 'generally',
    'specifically', 'approximately', 'constantly', 'occasionally', 'regularly',
    'precisely', 'thoroughly',
]


# Fetch the Punkt tokenizer data needed by word_tokenize.
# NOTE(review): newer NLTK releases reportedly load 'punkt_tab' instead —
# confirm against the installed NLTK version.
nltk.download('punkt')


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

## **Wiki10 Dataset:**

In [42]:
# Load the raw Wiki10 export from the working directory. Each Tags cell holds a
# Python list rendered as one string (see the sample printed in the next cell).

wiki_data = pd.read_csv("wiki10_data.csv")
wiki_data.head()

Unnamed: 0,Title,Tags
0,Hermann Zapf,"['font', 'history', 'fonts', 'typography', 'de..."
1,Yakety Sax,"['music', 'wikipedia', 'humor', 'reference', '..."
2,Torx,"['tools', 'hardware', 'reference', 'wikipedia'..."
3,Tom Robbins,"['books', 'writer', 'people', 'wikipedia', 'wr..."
4,Amazon Web Services,"['amazon', 'webservices', 'aws', 'cloud', 'web..."


In [43]:
# Inspect one raw Tags cell: a single string containing a list literal.
wiki_data['Tags'][0]

"['font', 'history', 'fonts', 'typography', 'designer', 'design', 'zapf', 'wikipedia', 'print', 'article', 'types', 'typedesigner', 'type', 'typo_fonts_designers', 'typographer', 'wiki', 'tipografia_tcc', 'zapfino', 'optima', 'herman', 'archive', 'palatino', 'palationo', 'research', 'read', 'se']"

In [44]:
# handle the data: missing values & drop duplicates


def handle_data(data):
  """Forward-fill missing values and drop duplicate rows of ``data`` in place.

  Args:
    data: pandas DataFrame to clean. It is modified in place, so the caller's
      frame changes too.

  Returns:
    The same DataFrame object that was passed in, after cleaning.
  """
  # Fill each missing value with the last valid value above it in its column.
  data.ffill(inplace=True)
  # De-duplicate the frame that was passed in, not the notebook-level
  # `wiki_data`, so the function works for any DataFrame.
  data.drop_duplicates(inplace=True)
  return data


# handle_data returns the frame it was given, so `data` and `wiki_data` refer
# to the same (now cleaned) DataFrame.
data = handle_data(data=wiki_data)
data.head()

Unnamed: 0,Title,Tags
0,Hermann Zapf,"['font', 'history', 'fonts', 'typography', 'de..."
1,Yakety Sax,"['music', 'wikipedia', 'humor', 'reference', '..."
2,Torx,"['tools', 'hardware', 'reference', 'wikipedia'..."
3,Tom Robbins,"['books', 'writer', 'people', 'wikipedia', 'wr..."
4,Amazon Web Services,"['amazon', 'webservices', 'aws', 'cloud', 'web..."


In [45]:
# Row/column count after cleaning (two columns: Title and Tags).
wiki_data.shape

(20760, 2)

In [46]:
# Count the missing values left in each column.
wiki_data.isnull().sum()

Title    0
Tags     0
dtype: int64

In [47]:
# Separate X & Y:

def separate_x_y(data, y_col_name):
  """Split a DataFrame into flattened feature values and the target column.

  Args:
    data: pandas DataFrame holding the target column and the feature column(s).
    y_col_name: label of the target column.

  Returns:
    tuple: ``(X, Y)`` where ``X`` is a flat Python list with every value of the
    non-target columns, taken row by row, and ``Y`` is the target column as a
    pandas Series.
  """
  target = data[y_col_name]
  feature_rows = data.drop(y_col_name, axis=1).values.tolist()

  # Concatenate the per-row value lists into one flat list.
  flattened = []
  for row in feature_rows:
    flattened.extend(row)

  return flattened, target


# X: flat list of titles (Title is the only non-target column);
# Y: Series of raw tag strings.
X, Y = separate_x_y(data=wiki_data, y_col_name="Tags")

In [48]:
# Text-Preprocessing:

def text_preprocessing(data):
  """Normalise an iterable of texts into cleaned, space-joined token strings.

  Each item is lower-cased, stripped of ASCII punctuation, digits and any
  remaining non-word/non-space characters, tokenised with NLTK's
  ``word_tokenize`` and filtered against the module-level ``custom_words``
  stop-word list.

  Args:
    data: iterable of texts (list, pandas Series, ...). Items that are not
      strings (for example a float NaN) are converted with ``str()`` first
      instead of raising on ``.lower()``.

  Returns:
    list: one cleaned string per input item, in input order. An item whose
    tokens are all removed yields an empty string.
  """
  # Loop-invariant helpers are built once instead of once per sentence.
  punctuation_table = str.maketrans('', '', string.punctuation)
  digits_pattern = re.compile(r'\d+')
  special_chars_pattern = re.compile(r'[^\w\s]')
  stop_words = set(custom_words)  # constant-time membership tests

  preprocessed_sentences = []

  for sentence in data:
    # lower-case the sentence (coercing non-string items to text first):
    text = str(sentence).lower()

    # remove ASCII punctuation (this includes '_' and '-', so multi-part
    # tags such as 'machine-learning' collapse into one token):
    text = text.translate(punctuation_table)

    # remove the numbers:
    text = digits_pattern.sub('', text)

    # remove any remaining special characters (e.g. non-ASCII punctuation):
    text = special_chars_pattern.sub('', text)

    # word tokenization:
    tokens = word_tokenize(text)

    # remove the stop words:
    words = [word for word in tokens if word not in stop_words]

    # rebuild the pre-processed sentence:
    preprocessed_sentences.append(' '.join(words))

  return preprocessed_sentences

In [49]:
# Clean the titles and the raw tag strings with the same routine. For the tags,
# punctuation removal strips the list syntax (brackets, quotes, commas), leaving
# space-separated tag tokens; it also removes '_' inside a tag
# (e.g. 'typo_fonts_designers' becomes 'typofontsdesigners').
titles = text_preprocessing(X)
Y__ = text_preprocessing(Y)


In [50]:
# Wrap the cleaned tag strings in a Series to use the vectorised .str accessor.
y_data_series = pd.Series(Y__)

# Split each string on whitespace into a list of tags (one list per title)
tags_data = list(y_data_series.str.split())


In [51]:
# Peek at the tag lists of the first two titles.
tags_data[0:2]


[['font',
  'history',
  'fonts',
  'typography',
  'designer',
  'design',
  'zapf',
  'wikipedia',
  'print',
  'article',
  'types',
  'typedesigner',
  'type',
  'typofontsdesigners',
  'typographer',
  'wiki',
  'tipografiatcc',
  'zapfino',
  'optima',
  'herman',
  'archive',
  'palatino',
  'palationo',
  'research',
  'read',
  'se'],
 ['music',
  'wikipedia',
  'humor',
  'reference',
  'usa',
  'soundtracks',
  'saxo',
  'tema',
  'wikipediapage',
  'muusika',
  'yaketysax',
  'bennyhill',
  'benny',
  'depdepdebedebedepdedede',
  'hill',
  'musica']]

In [52]:

# Keep only the first two tags of each title (in their original list order).
tags = [sublist[:2] for sublist in tags_data]
tags[0:10]


[['font', 'history'],
 ['music', 'wikipedia'],
 ['tools', 'hardware'],
 ['books', 'writer'],
 ['amazon', 'webservices'],
 ['personality', 'enfp'],
 ['cropcircles', 'ufo'],
 ['system', 'geodetic'],
 ['japanese', 'literature'],
 ['montserrat', 'masks']]

In [53]:
# Assemble the cleaned titles and truncated tag lists into a new frame.
# Note: this rebinds `data`, which until now referred to the raw wiki_data frame.
dd = {"Title":titles, "Tags":tags}
data = pd.DataFrame(dd)
data.head()


Unnamed: 0,Title,Tags
0,hermann zapf,"[font, history]"
1,yakety sax,"[music, wikipedia]"
2,torx,"[tools, hardware]"
3,tom robbins,"[books, writer]"
4,amazon web services,"[amazon, webservices]"


In [54]:
# Save into the Clean_Data directory, creating it first if it does not exist
# yet (to_csv does not create missing parent directories).

os.makedirs("Clean_Data", exist_ok=True)
data.to_csv("Clean_Data/wiki_data.csv", index=False)

In [55]:
# Load the saved file back to verify it. The tag lists were written as text,
# so each Tags cell is read back as a string such as "['font', 'history']",
# not as a list.

df = pd.read_csv("Clean_Data/wiki_data.csv")
df.head()

Unnamed: 0,Title,Tags
0,hermann zapf,"['font', 'history']"
1,yakety sax,"['music', 'wikipedia']"
2,torx,"['tools', 'hardware']"
3,tom robbins,"['books', 'writer']"
4,amazon web services,"['amazon', 'webservices']"


In [56]:
# Shows that a round-tripped Tags value is a plain string, not a list.
df['Tags'][0]

"['font', 'history']"

## **auto_tagging Dataset:**

In [57]:
# Load the raw auto-tagging export. Its Tags cells are strings in which the
# quoted tags are separated by spaces only (no commas), as the preview shows.

auto_tagging_data = pd.read_csv("auto_tagging_data.csv")
auto_tagging_data.head()

Unnamed: 0,Title,Tags
0,The Two Cultures: statistics vs. machine learn...,['machine-learning']
1,Forecasting demographic census,['forecasting']
2,Bayesian and frequentist reasoning in plain En...,['bayesian']
3,What is the meaning of p values and t values i...,['hypothesis-testing' 't-test' 'p-value' 'inte...
4,Examples for teaching: Correlation does not me...,['correlation']


In [58]:
# Row/column count of the raw auto-tagging data.
auto_tagging_data.shape

(76365, 2)

In [59]:
# Count the missing values in each column.
auto_tagging_data.isnull().sum()

Title    0
Tags     0
dtype: int64

In [60]:
# Separate X & Y:

def separate_x_y(data, y_col_name):
  """Return the flattened non-target values and the target column of ``data``.

  NOTE(review): this re-defines the function already declared in the Wiki10
  section with the same logic; consider keeping a single definition.

  Args:
    data: pandas DataFrame containing the target column.
    y_col_name: label of the target column.

  Returns:
    tuple: ``(X, Y)`` — ``X`` is a flat list of all values from the remaining
    columns in row-major order, ``Y`` is the target column as a Series.
  """
  labels = data[y_col_name]
  features = data.drop(columns=[y_col_name])
  values = [cell for record in features.values.tolist() for cell in record]
  return values, labels

# Rebinds X and Y, replacing the values computed for the Wiki10 data.
X, Y = separate_x_y(data=auto_tagging_data, y_col_name="Tags")

In [61]:
# Text-Preprocessing:

def text_preprocessing(data):
  """Clean every text in ``data`` and return the cleaned strings as a list.

  NOTE(review): this re-defines the function already declared in the Wiki10
  section with the same logic; consider keeping a single definition.

  Args:
    data: iterable of strings (titles or raw tag strings).

  Returns:
    list: one string per input item, made of the surviving tokens joined by
    single spaces.
  """
  def _clean(raw):
    # lower-case, then drop ASCII punctuation
    lowered = raw.lower()
    without_punctuation = lowered.translate(str.maketrans('', '', string.punctuation))

    # drop digit runs, then anything that is neither a word character nor whitespace
    without_digits = re.sub(r'\d+', '', without_punctuation)
    words_only = re.sub(r'[^\w\s]', '', without_digits)

    # tokenise and discard the stop words listed in custom_words
    kept = filter(lambda token: token not in custom_words, word_tokenize(words_only))
    return ' '.join(kept)

  return [_clean(sentence) for sentence in data]


# Clean the titles and the raw tag strings (rebinding `titles` and `Y__`).
# Punctuation removal also deletes the hyphens inside tags, so
# 'machine-learning' becomes the single token 'machinelearning'.
titles = text_preprocessing(X)
Y__ = text_preprocessing(Y)


In [62]:
# Wrap the cleaned tag strings in a Series to use the vectorised .str accessor.
y_data_series = pd.Series(Y__)

# Split each string on whitespace into a list of tags (one list per title)
tags_data = list(y_data_series.str.split())


In [63]:
# Peek at the tag lists of the first ten titles.
tags_data[0:10]


[['machinelearning'],
 ['forecasting'],
 ['bayesian'],
 ['hypothesistesting', 'ttest', 'pvalue', 'interpretation'],
 ['correlation'],
 ['nonparametric', 'survival'],
 ['timeseries'],
 ['datavisualization', 'references'],
 ['machinelearning'],
 ['references']]

In [64]:

# Keep at most the first three tags of each title (the Wiki10 section keeps two).
tags = [sublist[:3] for sublist in tags_data]
tags[0:10]


[['machinelearning'],
 ['forecasting'],
 ['bayesian'],
 ['hypothesistesting', 'ttest', 'pvalue'],
 ['correlation'],
 ['nonparametric', 'survival'],
 ['timeseries'],
 ['datavisualization', 'references'],
 ['machinelearning'],
 ['references']]

In [65]:
# Assemble the cleaned titles and truncated tag lists into a new frame.
# Note: this rebinds `dd` and `data`, replacing the Wiki10 versions.
dd = {"Title":titles, "Tags":tags}
data = pd.DataFrame(dd)
data.head()


Unnamed: 0,Title,Tags
0,two cultures statistics vs machine learning,[machinelearning]
1,forecasting demographic census,[forecasting]
2,bayesian frequentist reasoning plain english,[bayesian]
3,what meaning p values t values statistical tests,"[hypothesistesting, ttest, pvalue]"
4,examples teaching correlation does mean causation,[correlation]


In [74]:
# Save into the Clean_Data directory, creating it first if it does not exist
# yet (to_csv does not create missing parent directories).

os.makedirs("Clean_Data", exist_ok=True)
data.to_csv("Clean_Data/auto_tagged_data.csv", index=False)

In [75]:
# Load the saved file back to verify it. The tag lists were written as text,
# so each Tags cell is read back as a string, not as a list.

df1 = pd.read_csv("Clean_Data/auto_tagged_data.csv")
df1.head()

Unnamed: 0,Title,Tags
0,two cultures statistics vs machine learning,['machinelearning']
1,forecasting demographic census,['forecasting']
2,bayesian frequentist reasoning plain english,['bayesian']
3,what meaning p values t values statistical tests,"['hypothesistesting', 'ttest', 'pvalue']"
4,examples teaching correlation does mean causation,['correlation']


In [76]:
# Shows that a round-tripped Tags value is a plain string, not a list.
df1['Tags'][3]

"['hypothesistesting', 'ttest', 'pvalue']"

## **Store the data in an AWS S3 bucket:**

In [None]:
!pip install boto3
!pip install s3fs

In [70]:
import boto3

In [71]:
# Read the AWS credentials from the environment instead of embedding them in
# the notebook, where they would be saved and shared with the file.
# When a variable is unset the value is None and boto3 falls back to its
# default credential lookup.
acess_key = os.environ.get("AWS_ACCESS_KEY_ID")
secret_key = os.environ.get("AWS_SECRET_ACCESS_KEY")

# Resource-level S3 client used by every bucket operation below.
s3 = boto3.resource(
    service_name='s3',
    region_name = 'us-east-1',
    aws_access_key_id = acess_key,
    aws_secret_access_key = secret_key
)



In [72]:
# Print the name of every S3 bucket returned for these credentials:

for bucket in s3.buckets.all():
  print(bucket.name)


doc-tag


In [77]:
# Upload both cleaned CSV files to the "doc-tag" bucket, using the bare file
# name as the object key:

s3.Bucket("doc-tag").upload_file(Filename="Clean_Data/wiki_data.csv", Key="wiki_data.csv")
s3.Bucket("doc-tag").upload_file(Filename="Clean_Data/auto_tagged_data.csv", Key="auto_tagged_data.csv")


In [78]:
# List the objects stored in the bucket to confirm both uploads.
for obj in s3.Bucket('doc-tag').objects.all():
    print(obj)


s3.ObjectSummary(bucket_name='doc-tag', key='auto_tagged_data.csv')
s3.ObjectSummary(bucket_name='doc-tag', key='wiki_data.csv')


In [79]:
# Load the Wiki10 CSV from the S3 bucket: fetch the object and let pandas read
# directly from the response body (no local copy is written).


obj = s3.Bucket('doc-tag').Object('wiki_data.csv').get()
data_1 = pd.read_csv(obj['Body'])
data_1.head()

Unnamed: 0,Title,Tags
0,hermann zapf,"['font', 'history']"
1,yakety sax,"['music', 'wikipedia']"
2,torx,"['tools', 'hardware']"
3,tom robbins,"['books', 'writer']"
4,amazon web services,"['amazon', 'webservices']"


In [80]:
# Load the auto-tagging CSV from the S3 bucket: fetch the object and let pandas
# read directly from the response body (rebinds `obj` from the previous cell).


obj = s3.Bucket('doc-tag').Object('auto_tagged_data.csv').get()
data_2 = pd.read_csv(obj['Body'])
data_2.head()

Unnamed: 0,Title,Tags
0,two cultures statistics vs machine learning,['machinelearning']
1,forecasting demographic census,['forecasting']
2,bayesian frequentist reasoning plain english,['bayesian']
3,what meaning p values t values statistical tests,"['hypothesistesting', 'ttest', 'pvalue']"
4,examples teaching correlation does mean causation,['correlation']
