# Exploración

In [None]:
!ls

In [None]:
!ls data

## Training

In [None]:
!ls data/training/

## Dataset: Acquisition 

In [None]:
!ls data/training/acq 

## Algún archivo

In [None]:
!cat data/training/acq/0000005

## Dataset: CPU

In [None]:
!ls data/training/cpu

## Buscar más Frecuentes

In [None]:
import os
# Collect a [directory, filename] pair for every file under data/training.
trainingFiles = []
for root, dirs, files in os.walk("data/training"):
    trainingFiles.extend([root, name] for name in files)

# Peek at the first ten entries.
trainingFiles[:10]

## Agrupar

In [None]:
import pandas as pd

In [None]:
labels = ['root', 'file']
df = pd.DataFrame.from_records(trainingFiles, columns=labels)

In [None]:
df[0:10]

In [None]:
df.describe()

## GroupBy

In [None]:
df.groupby('root').describe()

## Ordenar y sacar los 10
Los 11 en realidad, porque hay una categoría unknown

In [None]:
# Count the files in each folder and keep the 11 folders with the most files.
largest = df.groupby('root').count().nlargest(11, columns="file")
largest

In [None]:
largest.describe()

## Iterate through values:

In [None]:
# iterrows() yields (index, row) pairs; print only the first one.
for index, record in df.iterrows():
    print(index, record["root"], record["file"])
    break

In [None]:
# The index of `largest` is the folder; the "file" column holds its file count.
for folder, counts in largest.iterrows():
    print(folder, counts["file"])

## Refactor

In [None]:
path = "data/training"
def getAllFiles(path):
    """Return a ``[directory, filename]`` pair for every file found under *path*.

    The tree is traversed with ``os.walk``, so files inside nested
    sub-directories are included as well.
    """
    return [
        [folder, name]
        for folder, _subdirs, names in os.walk(path)
        for name in names
    ]

In [None]:
def getTop10Categories(files, n=10):
    """Return the *n* folders holding the most files, skipping "unknown" ones.

    Args:
        files: iterable of ``[root, file]`` pairs.
        n: how many categories to return (10 by default).

    Returns:
        A list of ``[root, count]`` pairs ordered by descending file count.
        It has fewer than *n* entries only when fewer folders are available.
    """
    labels = ['root', 'file']
    df = pd.DataFrame.from_records(files, columns=labels)
    counts = df.groupby('root')['file'].count()
    # Drop the "unknown" folders *before* ranking. Ranking 11 folders first
    # and filtering afterwards yields 11 rows whenever no "unknown" folder
    # makes the cut, and fewer than 10 when several of them do.
    keep = ["unknown" not in root for root in counts.index]
    known = counts.loc[keep]
    return [[root, count] for root, count in known.nlargest(n).items()]

In [None]:
def getFilesInFolders(folderList):
    """Return ``[directory, filename]`` pairs for all files under the given folders.

    Folders are visited in the order given, each one recursively via
    ``os.walk``.
    """
    collected = []
    for folder in folderList:
        for root, _subdirs, names in os.walk(folder):
            collected.extend([root, name] for name in names)
    return collected

In [None]:
def concatFiles(allFiles):
    """Join each ``[root, file]`` pair into a single ``"root/file"`` string."""
    paths = []
    for entry in allFiles:
        paths.append("/".join((str(entry[0]), str(entry[1]))))
    return paths

## Preview

### All Files

In [None]:
trainingFiles = getAllFiles("data/training")
print(len(trainingFiles))
print(trainingFiles[0])

### Top 10 Categories

In [None]:
top10 = getTop10Categories(trainingFiles)
print(len(top10))
print(top10[0])

### Top 10 Folders

In [None]:
top10Folders = [f[0] for f in top10]
top10Folders

### Top 10 Files

In [None]:
top10Files = getFilesInFolders(top10Folders)
print(len(top10Files))
print("\n".join(concatFiles(top10Files)[::500]))

### Check 'earn' Length

In [None]:
earnList = [r for r in top10Files if "earn" in r[0]]
len(earnList)

In [None]:
# str.join requires an iterable argument (calling it with none raises
# TypeError); join the [root, file] pair of the first 'earn' entry.
"/".join(earnList[0])

# Eliminar signos de puntuación

In [None]:
testFile = os.path.join(trainingFiles[0][0], trainingFiles[0][1])
testFile

In [None]:
# Read the sample document; the context manager closes the file even if
# read() raises.
with open(testFile, "r") as testFileOpen:
    testText = testFileOpen.read()
print(testText)

## Fuerza bruta

In [None]:
puntuacion = [",", ".", ";", ":", "<", ">", "-"]

In [None]:
sinPuntos = testText

# Remove each listed punctuation mark in turn: one str.replace call per symbol.
for punto in puntuacion:
    sinPuntos = sinPuntos.replace(punto, "")
    
print(sinPuntos)

Problemas de este método:
- Tener todos los signos de puntuación en el arreglo
- Eficiencia

## Expresiones regulares

In [None]:
import re, string

In [None]:
sinPuntos = testText

# Character class matching any single character of string.punctuation;
# re.escape stops those symbols from being interpreted as regex syntax.
regex = re.compile('[%s]' % re.escape(string.punctuation))
sinPuntos =  regex.sub('', sinPuntos)
    
print(sinPuntos)

## String Translate

In [None]:
string.punctuation

In [None]:
sinPuntos = testText
# The third argument of str.maketrans lists the characters to delete.
sinPuntos = sinPuntos.translate(str.maketrans('','', string.punctuation))
print(sinPuntos)

## Comparación

[Source](https://stackoverflow.com/questions/265960/best-way-to-strip-punctuation-from-a-string-in-python)

```python
replace   : 28.4436721802
regex     : 6.86155414581
translate : 2.12455511093
```

## Eliminar números

In [None]:
# Raw string so that \d reaches the regex engine untouched; in a plain
# literal '\d' is an invalid string escape and warns on recent Python versions.
sinNumeros = re.sub(r'\d', '', sinPuntos)
print(sinNumeros)

# Eliminar Stopwords

Al igual que en el caso anterior, podríamos crear un arreglo y eliminar las stopwords. Una vez más, tendríamos que construir (o [descargar](https://github.com/stopwords-iso/stopwords-en)) un listado de estas.

Para este caso, ocuparemos el [Natural Language Toolkit](http://www.nltk.org/) para hacer esto.

In [None]:
from nltk.corpus import stopwords

In [None]:
stop = set(stopwords.words('english'))
list(stop)[0:10]

In [None]:
# Lower-case the text, split it on whitespace and keep the words not in `stop`.
sinStop = [i for i in sinNumeros.lower().split() if i not in stop]
print(" ".join(sinStop))

# Tokenize

In [None]:
from nltk import word_tokenize
import nltk
nltk.download('punkt')

In [None]:
# Tokenize the lower-cased text with word_tokenize and drop tokens found in `stop`.
tokenized = [i for i in word_tokenize(sinNumeros.lower()) if i not in stop] 

In [None]:
tokenized[0:20]

# Stemming

In [None]:
from nltk.stem.porter import PorterStemmer
porter = PorterStemmer()

In [None]:
# Stem the lower-cased form of every token whose lower-cased form is not in `stop`.
stemText = [porter.stem(i.lower()) for i in word_tokenize(sinNumeros) if i.lower() not in stop]

In [None]:
stemText[0:20]

## Refactor

In [None]:
def processDocument(document):
    """Turn raw text into a list of stemmed tokens with stopwords removed.

    Punctuation and digits are stripped from *document*, the remaining text
    is tokenized with ``word_tokenize``, tokens whose lower-cased form is in
    the module-level ``stop`` collection are dropped, and the rest are
    lower-cased and stemmed with ``porter``.
    """
    sinPuntos = document.translate(str.maketrans('', '', string.punctuation))
    # Raw string: in a plain literal '\d' is an invalid string escape and
    # triggers a warning on recent Python versions.
    sinNumeros = re.sub(r'\d', '', sinPuntos)
    stemText = [porter.stem(i.lower()) for i in word_tokenize(sinNumeros) if i.lower() not in stop]
    return stemText

In [1]:
def readProcessDocument(documentFilename):
    """Read the file at *documentFilename* and return ``processDocument`` of its text."""
    # The context manager closes the file even when reading fails, and the
    # handle is already released by the time the text is processed.
    with open(documentFilename) as file:
        contents = file.read()
    return processDocument(contents)

### Preview

In [None]:
processDocument(testText)[0:20]

# Tablas Term Frequency (TF)

In [None]:
from sklearn.feature_extraction.text import CountVectorizer

In [None]:
vectorizer = CountVectorizer(analyzer=processDocument)
print(vectorizer)

In [None]:
# Build [path, text] pairs for every selected file.
allFilesContent = []
for r in concatFiles(top10Files):
    # Close each file right after reading it instead of leaving one open
    # handle per document until the garbage collector gets to it.
    with open(r) as f:
        allFilesContent.append([r, f.read()])
print(len(allFilesContent))
print(allFilesContent[0][0], allFilesContent[0][1])

In [None]:
trainVector = vectorizer.fit_transform([fc[1] for fc in allFilesContent])

In [None]:
trainVector.shape

In [None]:
print(trainVector)

In [None]:
print(trainVector.nnz)