Skip to content

Commit

Permalink
Merge branch 'stage' into production
Browse files Browse the repository at this point in the history
  • Loading branch information
moz-dfeller committed Feb 7, 2024
2 parents 3cfc99f + d43ca76 commit 93d9ddd
Show file tree
Hide file tree
Showing 58 changed files with 1,789 additions and 273 deletions.
2 changes: 1 addition & 1 deletion bundler/Dockerfile
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,7 @@ USER node
RUN mkdir -p /home/node/code && \
cd /home/node && \
curl https://sh.rustup.rs -sSf | sh -s -- -y && \
git clone --depth 1 -b release-v1.0.1 https://github.com/common-voice/CorporaCreator.git && \
git clone --depth 1 -b release-v1.1.0 https://github.com/common-voice/CorporaCreator.git && \
git clone --depth 1 -b release-v1.0.0 https://github.com/common-voice/mp3-duration-reporter.git && \
cd CorporaCreator && \
python3 setup.py install --user && \
Expand Down
10 changes: 9 additions & 1 deletion bundler/queries/bundleLocale.sql
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@ SELECT
clips.client_id,
path,
REPLACE(sentence, '\r\n', ' ') AS sentence,
COALESCE(sentence_domains.domain, '') AS sentence_domain,
COALESCE(SUM(votes.is_valid), 0) AS up_votes,
COALESCE(SUM(NOT votes.is_valid), 0) AS down_votes,
COALESCE(age, '') AS age,
Expand Down Expand Up @@ -54,6 +55,13 @@ FROM clips
LEFT JOIN ages ON demographics.age_id = ages.id
LEFT JOIN genders ON demographics.gender_id = genders.id
) demographics ON clips.id = demographics.clip_id
-- A subquery for sentence domains is faster than a full join
LEFT JOIN (
SELECT s.id as sentence_id, sd.domain
FROM sentences s
INNER JOIN sentence_metadata sm ON sm.sentence_id = s.id
INNER JOIN sentence_domains sd ON sm.domain_id = sd.id
) sentence_domains ON clips.original_sentence_id = sentence_domains.sentence_id
WHERE clips.created_at BETWEEN ? AND ?
AND locales.name = ?
GROUP BY clips.id
GROUP BY clips.id
55 changes: 1 addition & 54 deletions bundler/src/core/clips.ts
Original file line number Diff line number Diff line change
Expand Up @@ -38,6 +38,7 @@ export const TSV_COLUMNS = [
'client_id',
'path',
'sentence',
'sentence_domain',
'up_votes',
'down_votes',
'age',
Expand Down Expand Up @@ -105,60 +106,6 @@ const transformClips = (isMinorityLanguage: boolean) =>
objectMode: true,
})

/**
 * Downloads the clips as they come in and saves them in the clips
 * directory: `releaseName/locale/clips/`. Passes the unaltered row
 * from the previous stream on to the next one.
 *
 * @remarks
 *
 * The stream is in object mode.
 *
 * A failure of either the download stream or the file write stream is
 * reported through the transform callback, so the surrounding pipeline
 * errors out instead of waiting forever for a row that never completes.
 *
 * @param releaseDirPath - root directory of the release being built
 * @returns a Transform that emits each `ClipRow` once its clip is on disk
 */
const downloadClips = (releaseDirPath: string) => {
  return new Transform({
    transform(chunk: ClipRow, encoding, callback) {
      const clipFilename = createClipFilename(chunk.locale, chunk.id)
      const clipFilepath = path.join(
        releaseDirPath,
        chunk.locale,
        'clips',
        clipFilename,
      )
      const writeStream = fs.createWriteStream(clipFilepath)

      // Both streams can fail; make sure the callback fires exactly once.
      let settled = false

      const succeed = () => {
        if (settled) return
        settled = true
        callback(null, chunk)
      }

      const fail = (err: Error) => {
        if (settled) return
        settled = true
        // `pipe` does not close the destination when the source errors.
        writeStream.destroy()
        // Remove the partial file: checkClipForExistence treats any file
        // already on disk as downloaded. Unlink errors are ignored on
        // purpose, the original error is the one worth reporting.
        fs.unlink(clipFilepath, () => callback(err))
      }

      // NOTE(review): assumes streamDownloadFileFromBucket returns a Node
      // Readable (it is piped below) - confirm against its definition.
      streamDownloadFileFromBucket(CLIPS_BUCKET)(chunk.path)
        .on('error', fail)
        .pipe(writeStream)
        .on('error', fail)
        .on('finish', succeed)
    },
    objectMode: true,
  })
}

/**
 * Filters clip rows before they are downloaded.
 *
 * A row is dropped when its clip file is already present under
 * `releaseDirPath/locale/clips/`, or when the clip cannot be found in the
 * clips bucket (a lookup failure counts as "not found"). Every other row is
 * passed through unchanged.
 *
 * @remarks
 *
 * The stream is in object mode.
 */
const checkClipForExistence = (releaseDirPath: string) => {
  return new Transform({
    objectMode: true,
    transform(chunk: ClipRow, encoding, callback) {
      const localClipPath = path.join(
        releaseDirPath,
        chunk.locale,
        'clips',
        createClipFilename(chunk.locale, chunk.id),
      )

      // Already on disk: nothing to forward.
      if (fs.existsSync(localClipPath)) {
        callback()
        return
      }

      // Any error from the bucket lookup is folded into `false`.
      const existsInBucket = pipe(
        doesFileExistInBucket(CLIPS_BUCKET)(chunk.path),
        TE.getOrElse(() => T.of(false)),
      )

      existsInBucket().then(found => {
        if (!found) {
          console.log(`Skipping file ${chunk.path}`)
          callback()
          return
        }
        callback(null, chunk)
      })
    },
  })
}

/**
 * Builds the path `<tmp dir>/<prevReleaseName>/<locale>/clips`.
 */
const getPreviousReleaseClipDir = (locale: string, prevReleaseName: string) => {
  const segments = [getTmpDir(), prevReleaseName, locale, 'clips']
  return path.join(...segments)
}

Expand Down
77 changes: 57 additions & 20 deletions bundler/src/core/stats.ts
Original file line number Diff line number Diff line change
Expand Up @@ -55,15 +55,34 @@ type Age = {

type Gender = {
'': number
male: number
female: number
other: number
male_masculine: number
female_feminine: number
transgender: number
'non-binary': number
do_not_wish_to_say: number
}

/**
 * Numeric tally per sentence domain.
 *
 * NOTE(review): the empty-string key presumably collects rows that carry no
 * domain - confirm against the query that produces `sentence_domain`.
 */
type SentenceDomain = Record<
  | ''
  | 'agriculture'
  | 'automotive'
  | 'finance'
  | 'food_service_retail'
  | 'general'
  | 'healthcare'
  | 'history_law_government'
  | 'language_fundamentals'
  | 'media_entertainment'
  | 'nature_environment'
  | 'news_current_affairs'
  | 'technology_robotics',
  number
>

/**
 * Breakdown tallies kept per locale, one entry per dimension.
 *
 * The `age`, `gender` and `sentence_domain` entries are incremented next to
 * the clip counter while stats are extracted from the clips file.
 */
type Splits = {
accent: Accent
age: Age
gender: Gender
// Tally per sentence domain, keyed by the row's `sentence_domain` value.
sentence_domain: SentenceDomain
}

type Locale = {
Expand Down Expand Up @@ -110,9 +129,26 @@ const createEmptyLocale = (): Locale => {
},
gender: {
'': 0,
male: 0,
female: 0,
other: 0,
male_masculine: 0,
female_feminine: 0,
transgender: 0,
'non-binary': 0,
do_not_wish_to_say: 0,
},
sentence_domain: {
'': 0,
agriculture: 0,
automotive: 0,
finance: 0,
food_service_retail: 0,
general: 0,
healthcare: 0,
history_law_government: 0,
language_fundamentals: 0,
media_entertainment: 0,
nature_environment: 0,
news_current_affairs: 0,
technology_robotics: 0,
},
},
users: 0,
Expand Down Expand Up @@ -196,6 +232,7 @@ const extractStatsFromClipsFile = (locale: string, releaseDirPath: string) =>
initialLocale.clips++
initialLocale.splits.age[data.age as keyof Age]++
initialLocale.splits.gender[data.gender as keyof Gender]++
initialLocale.splits.sentence_domain[data.sentence_domain as keyof SentenceDomain]++
})
.on('finish', () => {
stats.locales[locale] = initialLocale
Expand Down Expand Up @@ -233,24 +270,24 @@ const unitToHours = (
}
/**
 * Creates a step that fills in the duration statistics of one locale.
 *
 * The average clip duration is the total duration divided by the locale's
 * clip count; the validated duration is that average multiplied by the
 * number of validated clips. A locale with no clips gets `0` for both
 * values instead of `NaN` / `Infinity`.
 *
 * @param locale - key of the entry in `stats.locales` to update
 * @param totalDurationInMs - total duration for the locale, in milliseconds
 * @param stats - stats object; its entry for `locale` is mutated in place
 * @returns the same `stats` object that was passed in
 */
const calculateDurations =
  (locale: string) =>
  (totalDurationInMs: number) =>
  (stats: Stats): Stats => {
    const localeStats = stats.locales[locale]
    const validClips = localeStats.buckets.validated

    // Guard the division: with zero clips the quotient would be NaN (0 / 0)
    // or Infinity, and every derived figure would inherit it.
    const avgDurationMs =
      localeStats.clips > 0 ? totalDurationInMs / localeStats.clips : 0

    localeStats.duration = totalDurationInMs
    localeStats.avgDurationSecs = Math.round(avgDurationMs) / 1000
    localeStats.validDurationSecs =
      Math.round(avgDurationMs * validClips) / 1000

    localeStats.totalHrs = unitToHours(localeStats.duration, 'ms', 2)
    localeStats.validHrs = unitToHours(localeStats.validDurationSecs, 's', 2)

    stats.locales[locale] = localeStats

    return stats
  }

export const statsPipeline = (
locale: string,
Expand Down
3 changes: 2 additions & 1 deletion bundler/src/types.ts
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@ export type ClipRow = {
client_id: string
path: string
sentence: string
sentence_domain: string
up_votes: string
down_votes: string
age: string
Expand Down Expand Up @@ -39,4 +40,4 @@ export type AppEnv = Settings & {
releaseDirPath: string
clipsDirPath: string
releaseTarballsDirPath: string
}
}
3 changes: 3 additions & 0 deletions common/sentences.ts
Original file line number Diff line number Diff line change
@@ -1,8 +1,11 @@
import { SentenceDomain } from './taxonomies'

/**
 * Shape of a sentence submission.
 */
export type SentenceSubmission = {
sentence: string
source: string
localeId: number
localeName: string
// Optional; when present it must be one of the `SentenceDomain` values
// exported by ./taxonomies.
domain?: SentenceDomain
}

export enum SentenceSubmissionError {
Expand Down
29 changes: 23 additions & 6 deletions common/taxonomies.ts
Original file line number Diff line number Diff line change
@@ -1,10 +1,10 @@
export type TaxonomyToken = 'singlewordBenchmark' | 'covid19Spotter';
export type TaxonomyToken = 'singlewordBenchmark' | 'covid19Spotter'

export type TaxonomyType = {
name: string;
source: string;
locales: string[];
};
name: string
source: string
locales: string[]
}

export const taxonomies: { [key in TaxonomyToken]: TaxonomyType } = {
singlewordBenchmark: {
Expand All @@ -30,4 +30,21 @@ export const taxonomies: { [key in TaxonomyToken]: TaxonomyType } = {
source: 'du-covid-keywords',
locales: ['rw'],
},
};
}

/**
 * Identifiers of the sentence domains.
 *
 * Declared `as const` so the array is a readonly tuple of string literals
 * and the `SentenceDomain` union can be derived from it.
 */
export const sentenceDomains = [
'agriculture',
'automotive',
'finance',
'food_service_retail',
'general',
'healthcare',
'history_law_government',
'language_fundamentals',
'media_entertainment',
'nature_environment',
'news_current_affairs',
'technology_robotics',
] as const

/** Union of the string literals listed in `sentenceDomains`. */
export type SentenceDomain = typeof sentenceDomains[number]
10 changes: 10 additions & 0 deletions common/user-clients.ts
Original file line number Diff line number Diff line change
Expand Up @@ -20,3 +20,13 @@ export type UserClient = {
custom_goals?: CustomGoal[];
enrollment?: Enrollment;
};

/**
 * Object type with one string-valued entry per gender key.
 *
 * NOTE(review): the empty-string key presumably stands for "nothing
 * selected", and the values are presumably display strings - confirm
 * against the code that builds objects of this type.
 */
export type Gender = {
'': string;
male_masculine: string;
female_feminine: string;
intersex: string;
transgender: string;
'non-binary': string;
do_not_wish_to_say: string;
};
20 changes: 10 additions & 10 deletions docs/Sample Bulk Submission - Sheet1.tsv
Original file line number Diff line number Diff line change
@@ -1,10 +1,10 @@
Sentence (mandatory) Source (mandatory) Additional rationale for open license (mandatory) Sentence Quality Assurance Feedback (optional) O = satisfactory sentence, X = unsatisfactory sentence
Six years have passed since I resolved on my present undertaking. Frankenstien, Mary Shelly, 1818, https://www.gutenberg.org/files/42324/42324-h/42324-h.htm My own submission, copyright waived O
During her illness, many arguments had been urged to persuade my mother to refrain from attending upon her. Frankenstien, Mary Shelly, 1818, https://www.gutenberg.org/files/42324/42324-h/42324-h.htm My own submission, copyright waived O
She died calmly; and her countenance expressed affection even in death. Frankenstien, Mary Shelly, 1818, https://www.gutenberg.org/files/42324/42324-h/42324-h.htm MCV CC0 waiver process - see legal form O
My cat is a strange little dude. Jessica Rose (self) MCV CC0 waiver process - see legal form O
I should have brought sunscreen. Jessica Rose (self) More than 100 years since publication O
Have you read the Doraemon comics yet? Jessica Rose (self) More than 100 years since publication O
Her don't like pizza. Jane Doe (self) My own submission, copyright waived X
The cat was sitin on the windowsill. Jane Doe (self) My own submission, copyright waived X
The 3 elephants were playing in the mud John Doe (self) My own submission, copyright waived X
Sentence (mandatory) Source (mandatory) Additional rationale for open license (mandatory) Sentence Quality Assurance Feedback (optional) O = satisfactory sentence, X = unsatisfactory sentence Domain (optional)
Six years have passed since I resolved on my present undertaking. Frankenstien, Mary Shelly, 1818, https://www.gutenberg.org/files/42324/42324-h/42324-h.htm My own submission, copyright waived O General
During her illness, many arguments had been urged to persuade my mother to refrain from attending upon her. Frankenstien, Mary Shelly, 1818, https://www.gutenberg.org/files/42324/42324-h/42324-h.htm My own submission, copyright waived O General
She died calmly; and her countenance expressed affection even in death. Frankenstien, Mary Shelly, 1818, https://www.gutenberg.org/files/42324/42324-h/42324-h.htm MCV CC0 waiver process - see legal form O General
My cat is a strange little dude. Jessica Rose (self) MCV CC0 waiver process - see legal form O
I should have brought sunscreen. Jessica Rose (self) More than 100 years since publication O General
Have you read the Doraemon comics yet? Jessica Rose (self) More than 100 years since publication O General
Her don't like pizza. Jane Doe (self) My own submission, copyright waived X
The cat was sitin on the windowsill. Jane Doe (self) My own submission, copyright waived X
The 3 elephants were playing in the mud John Doe (self) My own submission, copyright waived X
3 changes: 2 additions & 1 deletion server/src/api/sentences/handler/add-sentence-handler.ts
Original file line number Diff line number Diff line change
Expand Up @@ -12,14 +12,15 @@ import { createPresentableError } from '../../../application/helper/error-helper
import { StatusCodes } from 'http-status-codes'

export default async (req: Request, res: Response) => {
const { sentence, localeId, localeName, source } = req.body
const { sentence, localeId, localeName, source, domain } = req.body

const command: AddSentenceCommand = {
clientId: req.client_id,
sentence: sentence,
localeId: localeId,
localeName: localeName,
source: source,
domain: domain
}

return pipe(
Expand Down
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
import { AllowedSchema } from 'express-json-validator-middleware'
import { sentenceDomains } from 'common'

export const AddSentenceRequest: AllowedSchema = {
type: 'object',
Expand All @@ -17,6 +18,10 @@ export const AddSentenceRequest: AllowedSchema = {
localeName: {
type: 'string',
},
domain: {
type: 'string',
enum: [...sentenceDomains],
},
},
}

Expand Down
Loading

0 comments on commit 93d9ddd

Please sign in to comment.